# In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load dataset (change path if necessary).
# The CSV is read without a header row, so column names are supplied explicitly.
DATA_PATH = '/content/training.1600000.processed.noemoticon.csv'
columns = ['target', 'id', 'date', 'flag', 'user', 'text']

# Keep only necessary columns. `usecols` drops the unused columns at parse
# time, so they are never held in memory, and `df` is a frame of its own
# rather than a slice of a wider one (assigning a column on such a slice is
# what triggers pandas' SettingWithCopyWarning).
df = pd.read_csv(
    DATA_PATH,
    encoding='latin-1',
    names=columns,
    usecols=['target', 'text'],
)

# Map sentiment codes 0 -> 'negative' and 4 -> 'positive'; any other value
# is left unchanged by `replace`.
df['target'] = df['target'].replace({0: 'negative', 4: 'positive'})

# Text cleaning function
# The stop-word set and the regular expressions are built once here instead
# of on every call, because clean_text is applied to each row of the dataset.
STOP_WORDS = frozenset("a an the and or for with is in on to of at it this that by from as are was were be been has had have".split())
URL_RE = re.compile(r'http\S+|www\S+')
MENTION_RE = re.compile(r'@\w+')
NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def clean_text(text):
    """Normalize one tweet for bag-of-words vectorization.

    Steps, in order: lowercase, strip URLs (``http...`` / ``www...``), strip
    ``@mentions``, delete every character that is not an ASCII letter or
    whitespace, then drop stop words and collapse runs of whitespace to
    single spaces.

    Args:
        text: The raw tweet. Non-string values (for example ``None`` or the
            NaN that pandas uses for a missing cell) are treated as empty.

    Returns:
        The cleaned text as a single space-separated string; ``''`` when
        nothing survives cleaning or when ``text`` is not a string.
    """
    # Missing cells arrive as float NaN / None, which have no .lower().
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = URL_RE.sub('', text)  # Remove URLs
    text = MENTION_RE.sub('', text)  # Remove mentions
    # Characters are deleted, not replaced by a space, so "don't" -> "dont".
    text = NON_LETTER_RE.sub('', text)  # Remove special characters
    return ' '.join(word for word in text.split() if word not in STOP_WORDS)

# Apply text cleaning
df['cleaned_text'] = df['text'].apply(clean_text)

# Split data: hold out 10% of the rows for evaluation. The fixed seed keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['target'],
    test_size=0.1,
    random_state=42,
)

# Vectorization (TF-IDF)
# The vocabulary (capped at 5000 terms) and the IDF weights are fitted on the
# training split only; the test split is transformed with that fitted state.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train classifier
# max_iter is only an upper bound on solver iterations. The library default
# of 100 can stop the solver before it has converged on a corpus this large
# (scikit-learn then emits a ConvergenceWarning); a higher cap lets it finish
# and changes nothing when it would have converged earlier anyway.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Predictions
y_pred = model.predict(X_test_tfidf)

# Model evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Visualization: Sentiment distribution
# Bar chart of how many rows carry each sentiment label.
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x='target', palette='coolwarm')
plt.title('Sentiment Distribution')
plt.show()

# Word cloud for positive and negative words
# Each cloud is generated from all cleaned tweets of one label joined into a
# single space-separated string.
positive_words = ' '.join(df.loc[df['target'] == 'positive', 'cleaned_text'])
negative_words = ' '.join(df.loc[df['target'] == 'negative', 'cleaned_text'])

# One entry per panel: (subplot position, title, background colour, text).
cloud_panels = [
    (1, 'Positive Tweets Word Cloud', 'white', positive_words),
    (2, 'Negative Tweets Word Cloud', 'black', negative_words),
]

plt.figure(figsize=(12, 6))
for panel_pos, panel_title, panel_bg, panel_text in cloud_panels:
    plt.subplot(1, 2, panel_pos)
    plt.title(panel_title)
    cloud_builder = WordCloud(width=400, height=300, background_color=panel_bg)
    wordcloud = cloud_builder.generate(panel_text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')

plt.show()