In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load datasets
twitter_data = pd.read_csv("/content/twitter_training.csv")
news_data = pd.read_csv("/content/news.csv")

# Display column names to verify structure
print("Twitter Data Columns:", twitter_data.columns)
print("News Data Columns:", news_data.columns)

# Rename columns if necessary
twitter_data.columns = ['ID', 'Brand', 'Sentiment', 'Phrase']
news_data.columns = ['TimeStamp', 'Headline']

# Ensure required columns exist
required_columns = {'ID', 'Brand', 'Sentiment', 'Phrase'}
if not required_columns.issubset(twitter_data.columns):
    raise KeyError(f"Missing required columns in Twitter data: {required_columns - set(twitter_data.columns)}")

if 'Headline' not in news_data.columns:
    raise KeyError("Missing required column: 'Headline' in News data")

# Text Preprocessing
def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = text.replace("[^a-zA-Z]", " ")
        return text
    return ""

twitter_data['Phrase'] = twitter_data['Phrase'].astype(str).apply(preprocess_text)
news_data['Headline'] = news_data['Headline'].astype(str).apply(preprocess_text)

# Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_twitter = tfidf_vectorizer.fit_transform(twitter_data['Phrase'])
y_twitter = twitter_data['Sentiment']
X_news = tfidf_vectorizer.transform(news_data['Headline'])

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_twitter, y_twitter, test_size=0.2, random_state=42)

# Train Naive Bayes Model
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Predict Sentiments for News Headlines
news_sentiments = model.predict(X_news)
news_data['Predicted Sentiment'] = news_sentiments

# Save results
news_data.to_csv("news_with_sentiments.csv", index=False)
print("Sentiment predictions saved to news_with_sentiments.csv")

Twitter Data Columns: Index(['2401', 'Borderlands', 'Positive',
       'im getting on borderlands and i will murder you all ,'],
      dtype='object')
News Data Columns: Index(['publish_date', 'headline'], dtype='object')
Model Accuracy: 0.6443730334069759
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.73      0.36      0.48      2661
    Negative       0.63      0.81      0.71      4471
     Neutral       0.65      0.55      0.60      3551
    Positive       0.63      0.73      0.68      4254

    accuracy                           0.64     14937
   macro avg       0.66      0.61      0.62     14937
weighted avg       0.65      0.64      0.63     14937

Sentiment predictions saved to news_with_sentiments.csv
