In [2]:
import pandas as pd

# Load the raw news export and reshape it in a single chain:
#   * 'Summary' becomes 'Description' and 'Datetime' becomes 'Time'
#   * 'Time' is parsed with the '%B %d, %Y - %H:%M' pattern and stored as a
#     'YYYY-MM-DD' string
#   * the 'Link' column is discarded
a = (
    pd.read_csv('/content/Vector News.csv')
    .rename(columns={'Summary': 'Description', 'Datetime': 'Time'})
    .assign(
        Time=lambda frame: pd.to_datetime(
            frame['Time'], format='%B %d, %Y - %H:%M'
        ).dt.strftime('%Y-%m-%d')
    )
    .drop(columns=['Link'])
)

# Write the cleaned table to a new CSV file (without the index column).
a.to_csv('/content/News_postprocess.csv', index=False)


In [4]:
# Shell escape: run the NLTK downloader module to fetch the 'vader_lexicon'
# resource (the captured output shows it being placed under /root/nltk_data).
!python -m nltk.downloader vader_lexicon


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from transformers import BertForSequenceClassification, BertTokenizer, pipeline
import time

# Fetch the sentence-tokenizer data that nltk.sent_tokenize needs.
# Older NLTK releases load the 'punkt' resource, newer ones look up
# 'punkt_tab' instead, so both are requested to work on either.
nltk.download('punkt')
nltk.download('punkt_tab')

# Wall-clock start; the elapsed time is reported at the end of the cell.
start = time.time()

# Read the pre-processed news table.
try:
    combined_df = pd.read_csv('/content/News_postprocess.csv')
except FileNotFoundError:
    print("File không tồn tại, vui lòng kiểm tra đường dẫn.")
    # Re-raise rather than calling exit(): inside a Jupyter/IPython kernel
    # exit() asks the kernel itself to shut down, discarding all notebook
    # state, whereas an exception simply stops this cell.
    raise

# Rows without a description cannot be scored, so drop them.
combined_df = combined_df.dropna(subset=['Description'])

# VADER sentiment analyzer.
vader_analyzer = SentimentIntensityAnalyzer()

# FinBERT sentiment pipeline (3-class 'yiyanghkust/finbert-tone' checkpoint).
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

# Average the per-sentence sentiment scores of a single news item.
def calculate_average_sentiment(texts):
    """Score each sentence with TextBlob, VADER and FinBERT and average the results.

    Relies on the module-level ``vader_analyzer`` and ``nlp`` objects.

    Args:
        texts: A sized sequence of sentence strings belonging to one news item.

    Returns:
        A 4-tuple ``(polarity_avg, subjectivity_avg, nltk_score, finBERT_score)``,
        each rounded to 4 decimals:
          * polarity_avg / subjectivity_avg - mean TextBlob polarity and subjectivity
          * nltk_score - mean VADER ``compound`` score
          * finBERT_score - mean signed FinBERT confidence, where the score counts
            as positive for label 'Positive', negative for 'Negative' and 0 otherwise
        When ``texts`` is empty there is nothing to average, so all four values
        are NaN instead of raising ZeroDivisionError.
    """
    times = len(texts)
    if times == 0:
        # No sentences (e.g. a whitespace-only description): report "missing"
        # rather than dividing by zero.
        missing = float('nan')
        return missing, missing, missing, missing

    polarity = 0
    subjectivity = 0
    com_avg = 0
    fin_avg = 0

    for text in texts:
        # TextBlob: polarity and subjectivity of the sentence.
        sentiment = TextBlob(text).sentiment
        polarity += sentiment.polarity
        subjectivity += sentiment.subjectivity

        # VADER: only the aggregated 'compound' score is used.
        com_avg += vader_analyzer.polarity_scores(text)['compound']

        # FinBERT: take the top prediction and turn its confidence into a
        # signed value according to the predicted label.
        top = nlp(text)[0]
        label = top['label']
        score = round(top['score'], 4)
        if label == 'Positive':
            fin_avg += score
        elif label == 'Negative':
            fin_avg -= score

    polarity_avg = round(polarity / times, 4)
    subjectivity_avg = round(subjectivity / times, 4)
    nltk_score = round(com_avg / times, 4)
    finBERT_score = round(fin_avg / times, 4)

    return polarity_avg, subjectivity_avg, nltk_score, finBERT_score


# Score every news item: split its description into sentences, average the
# sentence-level scores, then transpose the per-row 4-tuples into four columns.
per_row_scores = [
    calculate_average_sentiment(nltk.sent_tokenize(text))
    for text in combined_df['Description']
]
polarity_col, subjectivity_col, vader_col, finbert_col = zip(*per_row_scores)
combined_df['Polarity_avg'] = polarity_col
combined_df['Subjectivity_avg'] = subjectivity_col
combined_df['Nltk_Sentiment'] = vader_col
combined_df['FinBERT_Sentiment'] = finbert_col

# Write the enriched DataFrame to a new CSV file.
updated_csv_path = 'News_all_sentiments.csv'
combined_df.to_csv(updated_csv_path, index=False)

print(f"Kết quả phân tích cảm xúc đã được lưu vào {updated_csv_path}")

# Report the elapsed wall-clock time since `start` was recorded.
end = time.time()
elapsed_seconds = end - start
print("Thời gian: %.2f giây" % elapsed_seconds)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
import pandas as pd
# Reload the saved sentiment results from disk and preview the first 20 rows.
new = pd.read_csv('News_all_sentiments.csv')
new.head(20)

Unnamed: 0,Title,Time,Description,Polarity_avg,Subjectivity_avg,Nltk_Sentiment,FinBERT_Sentiment
0,An Giang bosters border economy,2024-05-02,Under a project to develop the province’s bord...,0.16,0.52,0.6597,0.9992
1,Digital transformation events for banking sect...,2024-05-01,This initiative aligns with the goals of the N...,0.0,0.0,0.25,0.9326
2,Việt Nam secures remarkable economic successes...,2024-04-29,"Back to 1975, Việt Nam, torn by two major wars...",-0.0729,0.1667,-0.3404,0.0
3,PM inspects drought combat in Ninh Thuận,2024-04-28,PM Chính asked the province to carry out its m...,-0.0667,0.1,0.4404,0.0
4,Policies needed to encourage e-commerce to emb...,2024-04-27,A recent report on plastic wastes from e-comme...,-0.03,0.31,0.6808,-0.9999
5,​​​​​​​VN needs drastic reforms to create firm...,2024-04-26,Việt Nam’s economy has a good start with a gro...,0.1375,0.4333,0.3352,0.0006
6,Keeping CPI low key to inflation control this ...,2024-04-25,Deputy Minister of Finance Nguyễn Đức Chi said...,0.1067,0.4517,-0.0258,-0.9995
7,VN's economy forecast to grow by 5.5 per cent ...,2024-04-24,"After experiencing downturns in 2023, Việt Nam...",0.25,0.3333,0.0,0.9999
8,Việt Nam becomes a fastest growing digital eco...,2024-04-23,Việt Nam has been the fastest growing digital ...,-0.05,0.2,0.6808,0.999
9,FDI flow into garment and textile sector bounc...,2024-04-22,The flow of foreign direct investment (FDI) in...,0.1958,0.4292,0.6124,1.0


In [26]:
import pandas as pd
from tqdm import tqdm

# Map a short key to the CSV file it is loaded from.
# Every key except 'sentiment_analysis' is loaded as a price-history file by the
# loading loop further down (the key, upper-cased, becomes the 'Symbol' column);
# 'sentiment_analysis' points at the news sentiment table.
file_paths = {
    'e1vfvn30': 'E1VFVN30 ETF Price History.csv',
    'etf': 'ETF Stock Price History.csv',
    'fueip100': 'FUEIP100 ETF Price History.csv',
    'fuemav30': 'FUEMAV30 ETF Price History.csv',
    'fuessv30': 'FUESSV30 Stock Price History.csv',
    'fuessv50': 'FUESSV50 Stock Price History.csv',
    'fuessvfl': 'FUESSVFL ETF Price History.csv',
    'fuevn100': 'FUEVN100 ETF Price History.csv',
    'sentiment_analysis': 'News_all_sentiments.csv'
}

# Function to load stock data
def load_stock_data(path, symbol):
    """Load one price-history CSV and normalise its columns.

    Args:
        path: Location of the CSV file. It must contain a 'Date' column.
        symbol: Identifier for the instrument; stored upper-cased in a new
            'Symbol' column.

    Returns:
        The loaded DataFrame with 'Date' parsed to datetime, a 'Symbol' column
        added, and whichever of 'Price', 'Open', 'High', 'Low' are present
        converted to float (thousands separators removed). If the file does not
        exist, a message is printed and an empty DataFrame is returned.
    """
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        print(f"File {path} không tồn tại, vui lòng kiểm tra đường dẫn.")
        return pd.DataFrame()

    df['Date'] = pd.to_datetime(df['Date'])
    df['Symbol'] = symbol.upper()

    for column in ('Price', 'Open', 'High', 'Low'):
        if column in df.columns:
            # A column whose values carry no thousands separator is parsed by
            # read_csv as numeric already, and the .str accessor raises on
            # non-string data. Going through str first handles both cases.
            df[column] = (
                df[column]
                .astype(str)
                .str.replace(',', '', regex=False)
                .astype(float)
            )
    return df

# Function to load sentiment data
def load_sentiment_data(path):
    """Load the news sentiment CSV and reduce it to one row per date.

    Args:
        path: Location of the CSV file. It must contain a 'Time' column and the
            four score columns 'Nltk_Sentiment', 'FinBERT_Sentiment',
            'Subjectivity_avg' and 'Polarity_avg'.

    Returns:
        A DataFrame with a 'Date' column (the parsed 'Time' values) followed by
        the four score columns, holding for each date the scores of the first
        row found for that date. If the file does not exist, a message is
        printed and an empty DataFrame is returned.

    NOTE(review): when several rows share a date only the first one is kept and
    the rest are ignored - confirm this is intended rather than an average.
    """
    def first_value(series):
        # First element of the group, or None for an empty group.
        return None if series.empty else series.iloc[0]

    score_columns = ('Nltk_Sentiment', 'FinBERT_Sentiment', 'Subjectivity_avg', 'Polarity_avg')

    try:
        news = pd.read_csv(path)
    except FileNotFoundError:
        print(f"File {path} không tồn tại, vui lòng kiểm tra đường dẫn.")
        return pd.DataFrame()

    news['Time'] = pd.to_datetime(news['Time'])
    per_day = (
        news.groupby('Time')
        .agg({name: first_value for name in score_columns})
        .reset_index()
    )
    return per_day.rename(columns={'Time': 'Date'})

# Load every configured file; the 'sentiment_analysis' entry goes through the
# sentiment loader, everything else through the stock loader.
data_frames = {}
for symbol, path in tqdm(file_paths.items(), desc="Loading data"):
    if symbol == 'sentiment_analysis':
        data_frames[symbol] = load_sentiment_data(path)
    else:
        data_frames[symbol] = load_stock_data(path, symbol)

# Stack all price tables (everything but the sentiment entry) into one frame.
stock_frames = [frame for key, frame in data_frames.items() if key != 'sentiment_analysis']
combined_stock_data = pd.concat(stock_frames)

# Inner-join prices with the per-date sentiment on 'Date', so only dates present
# in both tables survive.
final_combined_data = combined_stock_data.merge(
    data_frames['sentiment_analysis'],
    on='Date',
    how='inner',
)

# Write the merged table to disk (without the index column).
processed_csv_path = 'processed_financial_sentiment_data_combined.csv'
final_combined_data.to_csv(processed_csv_path, index=False)

print(f"Kết quả phân tích đã được lưu vào {processed_csv_path}")


Loading data: 100%|██████████| 9/9 [00:00<00:00, 14.83it/s]

Kết quả phân tích đã được lưu vào processed_financial_sentiment_data_combined.csv





In [69]:
import pandas as pd
# Reload the merged price/sentiment table from disk and preview the first 10 rows.
datav2 = pd.read_csv('processed_financial_sentiment_data_combined.csv')
datav2.head(10)

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %,Symbol,Nltk_Sentiment,FinBERT_Sentiment,Subjectivity_avg,Polarity_avg
0,2024-04-26,21550.0,21300.0,21550.0,21070.0,167.50K,2.38%,E1VFVN30,0.3352,0.0006,0.4333,0.1375
1,2024-04-26,30750.0,29500.0,30800.0,29500.0,6.56M,2.50%,ETF,0.3352,0.0006,0.4333,0.1375
2,2024-04-26,7790.0,7790.0,7790.0,7790.0,,-0.76%,FUEIP100,0.3352,0.0006,0.4333,0.1375
3,2024-04-26,14680.0,14640.0,14700.0,14560.0,13.70K,0.27%,FUEMAV30,0.3352,0.0006,0.4333,0.1375
4,2024-04-26,15310.0,15180.0,15500.0,15040.0,32.10K,0.79%,FUESSV30,0.3352,0.0006,0.4333,0.1375
5,2024-04-26,18670.0,18310.0,18700.0,18280.0,14.30K,-0.16%,FUESSV50,0.3352,0.0006,0.4333,0.1375
6,2024-04-26,20050.0,20370.0,20370.0,19850.0,249.90K,0.80%,FUESSVFL,0.3352,0.0006,0.4333,0.1375
7,2024-04-26,16520.0,17550.0,17550.0,16360.0,60.00K,0.67%,FUEVN100,0.3352,0.0006,0.4333,0.1375
8,2024-04-25,21050.0,21300.0,21300.0,20100.0,792.60K,-1.03%,E1VFVN30,-0.0258,-0.9995,0.4517,0.1067
9,2024-04-25,30000.0,30290.0,30290.0,29780.0,5.26M,1.01%,ETF,-0.0258,-0.9995,0.4517,0.1067


In [70]:
import pandas as pd

# Read the S&P 500 history from CSV.
datasp500 = pd.read_csv('/content/S&P 500 Historical Data.csv')

# Parse 'Date' and store it back as a 'YYYY-MM-DD' string.
datasp500['Date'] = pd.to_datetime(datasp500['Date']).dt.strftime('%Y-%m-%d')

# Convert the price columns to float and scale them by the exchange rate.
price_columns = ['Price', 'Open', 'High', 'Low']
# NOTE(review): multiplying by 25.441 yields values such as 128112.99, i.e.
# thousands of VND if the intended rate is roughly 25,441 VND per USD, while the
# ETF tables hold prices like 21550.0 - confirm which unit is intended.
ty_gia_vnd = 25.441
for column in [name for name in price_columns if name in datasp500.columns]:
    without_separators = datasp500[column].str.replace(',', '')
    datasp500[column] = without_separators.astype(float) * ty_gia_vnd

# The 'Vol.' column is not used downstream; drop it.
datasp500 = datasp500.drop(columns=['Vol.'])

# Show the first 5 rows as a sanity check.
print(datasp500.head())

# Write the processed table to a CSV file (without the index column).
processed_csv_path = '/content/processed_sp500_data.csv'
datasp500.to_csv(processed_csv_path, index=False)

print(f"Dữ liệu S&P 500 đã được lưu vào {processed_csv_path}")


         Date         Price          Open          High           Low Change %
0  2024-04-30  128112.98929  129845.26698  130024.62603  128103.32171   -1.57%
1  2024-04-29  130160.48097  130108.58133  130346.70909  129460.34465    0.32%
2  2024-04-26  129748.08236  129358.58065  130121.04742  129065.75474    1.02%
3  2024-04-25  128436.85322  127710.76708  128674.21775  126965.34578   -0.46%
4  2024-04-24  129027.33883  129363.92326  129481.46068  128401.23582    0.02%
Dữ liệu S&P 500 đã được lưu vào /content/processed_sp500_data.csv


In [71]:
# Rename the S&P 500 columns so they stay distinguishable from the ETF columns
# of the same name once both tables are joined.
sp500_column_names = {
    'Price': 'PriceSP500',
    'Open': 'OpenSP500',
    'High': 'HighSP500',
    'Low': 'LowSP500',
    'Change %': 'Change%SP500'
}
datasp500_renamed = datasp500.rename(columns=sp500_column_names)

# Inner-join the ETF/sentiment table with the S&P 500 table on 'Date'.
finnaldata = datav2.merge(datasp500_renamed, on='Date', how='inner')

# Write the joined table to disk (without the index column).
finnaldata_path = 'finaldata_combined.csv'
finnaldata.to_csv(finnaldata_path, index=False)

print(f"Dữ liệu đã được lưu vào tệp {finnaldata_path}")
# Preview the first rows to check the result.
finnaldata.head(5)


Dữ liệu đã được lưu vào tệp finaldata_combined.csv


Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %,Symbol,Nltk_Sentiment,FinBERT_Sentiment,Subjectivity_avg,Polarity_avg,PriceSP500,OpenSP500,HighSP500,LowSP500,Change%SP500
0,2024-04-26,21550.0,21300.0,21550.0,21070.0,167.50K,2.38%,E1VFVN30,0.3352,0.0006,0.4333,0.1375,129748.08236,129358.58065,130121.04742,129065.75474,1.02%
1,2024-04-26,30750.0,29500.0,30800.0,29500.0,6.56M,2.50%,ETF,0.3352,0.0006,0.4333,0.1375,129748.08236,129358.58065,130121.04742,129065.75474,1.02%
2,2024-04-26,7790.0,7790.0,7790.0,7790.0,,-0.76%,FUEIP100,0.3352,0.0006,0.4333,0.1375,129748.08236,129358.58065,130121.04742,129065.75474,1.02%
3,2024-04-26,14680.0,14640.0,14700.0,14560.0,13.70K,0.27%,FUEMAV30,0.3352,0.0006,0.4333,0.1375,129748.08236,129358.58065,130121.04742,129065.75474,1.02%
4,2024-04-26,15310.0,15180.0,15500.0,15040.0,32.10K,0.79%,FUESSV30,0.3352,0.0006,0.4333,0.1375,129748.08236,129358.58065,130121.04742,129065.75474,1.02%
