# Part 1 - Question 2
Please make sure to upload the dataset "SemEval2017 Task4_Sentiment_Analysis.csv" before running the code.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load dataset
df = pd.read_csv("SemEval2017 Task4_ Sentiment_Analysis.csv")

# Encode for classification
le = LabelEncoder()
df["label_encoded"] = le.fit_transform(df["label"])

# Map for regression
sentiment_mapping = {"positive": 1, "neutral": 0, "negative": -1}
df["label_regression"] = df["label"].map(sentiment_mapping)

# Train/test split
X_train, X_test, y_train_cls, y_test_cls = train_test_split(df["text"], df["label_encoded"], test_size=0.2, random_state=42)
_, _, y_train_reg, y_test_reg = train_test_split(df["text"], df["label_regression"], test_size=0.2, random_state=42)

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Classification model
clf = LogisticRegression(max_iter=1000)
y_pred_cls = clf.fit(X_train_tfidf, y_train_cls).predict(X_test_tfidf)
accuracy = accuracy_score(y_test_cls, y_pred_cls)

# Regression model
reg = Ridge()
y_pred_reg = reg.fit(X_train_tfidf, y_train_reg).predict(X_test_tfidf)
rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))

# Output results
print(f"Classification Accuracy: {accuracy:.3f}")
print(f"Regression RMSE: {rmse:.3f}")


Classification Accuracy: 0.668
Regression RMSE: 0.558


# Part 2
Please make sure to upload the dataset "20_Newsgroups.csv" before running the code

In [4]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from scipy.sparse import hstack

# === Load and Clean Data ===
df = pd.read_csv("20_Newsgroups.csv")

# Remove empty rows
df = df.dropna(subset=["text", "label"])
df = df[df["text"].str.strip() != ""]
df = df[df["label"].str.strip() != ""]

# Keep only labels: class-0 to class-5
valid_labels = {f'class-{i}' for i in range(6)}
df = df[df["label"].isin(valid_labels)].copy()

# Clean text: lowercase, remove emails, URLs, digits, symbols
def clean_text(text):
    text = text.lower()
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['clean_text'] = df['text'].apply(clean_text)

# Encode labels to integers
le = LabelEncoder()
df['label_enc'] = le.fit_transform(df['label'])

# === Prepare Text and Labels ===
X_text = df['clean_text']
y = df['label_enc']

# Split: train/dev/test = 60/20/20
X_temp, X_test, y_temp, y_test = train_test_split(X_text, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_dev, y_train, y_dev = train_test_split(X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=42)

# === Feature 1: TF-IDF ===
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_dev_tfidf = tfidf.transform(X_dev)
X_test_tfidf = tfidf.transform(X_test)

# === Feature 2–5: Custom Numerical Features ===
def avg_word_length(texts):
    return np.array([[np.mean([len(w) for w in t.split()]) if t.split() else 0] for t in texts])

def digit_count(texts):
    return np.array([[sum(c.isdigit() for c in t)] for t in texts])

def unique_word_count(texts):
    return np.array([[len(set(t.split()))] for t in texts])

def capital_letter_count(texts):
    return np.array([[sum(c.isupper() for c in t)] for t in texts])

# Compute features
avg_train = avg_word_length(X_train)
avg_dev = avg_word_length(X_dev)
avg_test = avg_word_length(X_test)

digit_train = digit_count(X_train)
digit_dev = digit_count(X_dev)
digit_test = digit_count(X_test)

uniq_train = unique_word_count(X_train)
uniq_dev = unique_word_count(X_dev)
uniq_test = unique_word_count(X_test)

caps_train = capital_letter_count(X_train)
caps_dev = capital_letter_count(X_dev)
caps_test = capital_letter_count(X_test)

# === Combine All Features ===
X_train_combined = hstack([X_train_tfidf, avg_train, digit_train, uniq_train, caps_train])
X_dev_combined = hstack([X_dev_tfidf, avg_dev, digit_dev, uniq_dev, caps_dev])
X_test_combined = hstack([X_test_tfidf, avg_test, digit_test, uniq_test, caps_test])

# === Train and Evaluate Model ===
clf = LogisticRegression(max_iter=3000)
clf.fit(X_train_combined, y_train)

y_pred = clf.predict(X_test_combined)

acc = accuracy_score(y_test, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')

# === Output Results ===
print(f"Accuracy: {acc:.3f}")
print(f"Macro Precision: {prec:.3f}")
print(f"Macro Recall: {rec:.3f}")
print(f"Macro F1: {f1:.3f}")

Accuracy: 0.848
Macro Precision: 0.856
Macro Recall: 0.852
Macro F1: 0.853
