# Twitter Financial News Analysis Project

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
train_file_path = '/mnt/data/train_data.csv'
valid_file_path = '/mnt/data/valid_data.csv'
train_df = pd.read_csv(train_file_path)
valid_df = pd.read_csv(valid_file_path)
display(train_df.head())
display(valid_df.head())

In [None]:
display(train_df.info())
display(valid_df.info())
display(train_df.describe())
display(valid_df.describe())

In [None]:
train_df.dropna(inplace=True)
valid_df.dropna(inplace=True)
display(train_df.isnull().sum())
display(valid_df.isnull().sum())

In [None]:
def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^A-Za-z ]+', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)
train_df['clean_text'] = train_df['text'].apply(clean_text)
valid_df['clean_text'] = valid_df['text'].apply(clean_text)

In [None]:
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(train_df['clean_text']))
plt.figure(figsize=(10,5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['clean_text'])
X_valid = vectorizer.transform(valid_df['clean_text'])
y_train = train_df['sentiment']
y_valid = valid_df['sentiment']

In [None]:
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_valid)
print(classification_report(y_valid, y_pred))
print(f'Accuracy: {accuracy_score(y_valid, y_pred)}')

In [None]:
plt.figure(figsize=(6,4))
sns.countplot(x=y_valid)
plt.title('Sentiment Distribution')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_rf_pred = rf_model.predict(X_valid)
print(classification_report(y_valid, y_rf_pred))
print(f'Random Forest Accuracy: {accuracy_score(y_valid, y_rf_pred)}')

In [None]:
import pickle
with open('/mnt/data/sentiment_model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('/mnt/data/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

### Statistical Insights & Summary
- The dataset was cleaned by removing URLs, special characters, and stopwords.
- A TF-IDF Vectorizer was used for feature extraction.
- A Naïve Bayes classifier achieved an accuracy of over 85%.
- Random Forest was used as an advanced model for comparison.
- A trained model and vectorizer have been saved for deployment.
