<a href="https://colab.research.google.com/github/NajouaMardi/sentiment-analysis/blob/master/app/Voting_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [None]:
# Read the review dataset from the CSV in the working directory into `df`,
# then preview its first rows (the cell's last expression is what gets shown).
df = pd.read_csv(filepath_or_buffer='processed_reviews_100k_cleaned.csv')
df.head()

In [None]:
# Quick structural overview of the frame: its (rows, columns) shape, the
# number of missing values in each column, and each column's dtype.
for label, value in (
    ("shape ", df.shape),
    ("checking if data is missing", df.isna().sum()),
    ("data types of columns", df.dtypes),
):
    print(label, value)

In [None]:
# Summary statistics for the numeric columns of `df`. This is the cell's last
# expression, so the resulting table is what the notebook displays.
df.describe()

In [None]:
# Number of rows for each distinct value of the "rating" column
# (value_counts orders the result from most to least frequent).
df['rating'].value_counts()

In [None]:
# Rows per rating, ordered by the rating value rather than by frequency.
rating_counts = df["rating"].value_counts().sort_index()
# Each rating's share of all counted rows, expressed in percent.
rating_percentages = rating_counts.div(rating_counts.sum()).mul(100)

# Side-by-side table: absolute count and percentage rounded to 2 decimals.
rating_distribution = pd.DataFrame(
    {"count": rating_counts, "percentage": rating_percentages.round(2)}
)

print(rating_distribution)

In [None]:
# Bar chart of how many rows carry each rating. value_counts() orders its
# result by frequency, so sort by the rating value itself to keep the x-axis
# in rating order (the same ordering the rating_distribution table uses).
df['rating'].value_counts().sort_index().plot(kind='bar', title='Distribution of Ratings')

In [None]:
# Class balance of the "sentiment" label: rows per class, most frequent first.
sentiment_counts = df["sentiment"].value_counts()
# Share of the whole frame (all rows of df) taken by each class, in percent.
sentiment_percentages = sentiment_counts.div(len(df)).mul(100)
sentiment_summary = pd.DataFrame(
    {"count": sentiment_counts, "percentage": sentiment_percentages.round(2)}
)
print(sentiment_summary)

In [None]:
# Make sure the NLTK "stopwords" corpus is available locally before reading it.
nltk.download('stopwords')
# English stop words held in a set so the per-token membership test in
# preprocess() is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))
# Stemming is currently disabled; a PorterStemmer would be created here
# if it were re-enabled.

def preprocess(text):
    """Normalise one piece of review text into a lowercase, stop-word-free string.

    The text is lowercased, every character that is not ``a``-``z`` or
    whitespace is deleted, the remainder is split on whitespace, tokens
    found in the module-level ``stop_words`` set are dropped, and the
    surviving tokens are joined with single spaces. No stemming is applied.

    Args:
        text: Raw text of one row. Any non-string value (for example the
            float NaN pandas uses for a missing cell, or ``None``) is
            treated as an empty document.

    Returns:
        The cleaned text, or ``''`` when the input is not a string or no
        token survives the filtering.
    """
    # Series.apply passes missing cells through as non-str objects, which
    # have no .lower(); map them to an empty document instead of raising.
    if not isinstance(text, str):
        return ''
    # Punctuation and digits are deleted outright (not replaced by a space),
    # so "don't" becomes "dont" and "well-made" becomes "wellmade".
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return ' '.join(word for word in letters_only.split() if word not in stop_words)

# Run preprocess() over every row of the full_text column (title + text, per
# the original note) and store the result in a new clean_full_text column;
# the raw full_text column is left as it is.
df['clean_full_text'] = df['full_text'].map(preprocess)


In [None]:
# Model inputs: the cleaned text is the single feature, sentiment is the target.
X, y = df['clean_full_text'], df['sentiment']

In [None]:


# Hold out 20% of the rows for the final evaluation. Stratifying on the label
# keeps the class proportions the same in the train and test parts, in line
# with the stratified folds used for cross-validation below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Base classifiers. The alpha / C values they start with are only defaults:
# the grid search below overrides them for every candidate.
nb = MultinomialNB(alpha=0.1)
svm = LinearSVC(class_weight='balanced', max_iter=3000)
lr = LogisticRegression(solver='saga', class_weight='balanced', max_iter=3000)

# Majority-vote ensemble over the three predicted labels. Soft voting would
# need predict_proba from every estimator, which LinearSVC does not provide.
voting = VotingClassifier(
    estimators=[
        ('nb', nb),
        ('svm', svm),
        ('lr', lr),
    ],
    voting='hard',
)

# Full pipeline: TF-IDF features -> chi-squared feature selection -> ensemble.
# Keeping the vectoriser and selector inside the pipeline means they are
# re-fitted on the training part of each CV fold only.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('chi2', SelectKBest(score_func=chi2)),
    ('vote', voting),
])

# Search space covering both the feature extraction / selection steps and
# the regularisation of each base classifier.
# Size: 2 * 2 * 2 * 3 * 3 * 3 = 216 candidates, each fitted on 5 folds.
param_grid = {
    'tfidf__max_features': [4000, 5000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    # Every k is below the smallest max_features, so the selector always
    # reduces the TF-IDF vocabulary.
    'chi2__k': [2000, 3000],
    # Inverse regularisation strength of LogisticRegression
    'vote__lr__C': [0.1, 1, 10],
    # Inverse regularisation strength of LinearSVC
    'vote__svm__C': [0.1, 1, 10],
    # Additive smoothing of MultinomialNB
    'vote__nb__alpha': [0.01, 0.1, 1.0],
}

# 5-fold stratified cross-validation with a fixed seed for reproducibility.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search scored on accuracy, using all available cores.
grid = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)

# Evaluate the best pipeline found by the search on the untouched hold-out set.
y_pred = grid.predict(X_test)

print("Best parameters:")
for param_name, param_value in grid.best_params_.items():
    print(f"{param_name}: {param_value}")
print("Best CV Accuracy:", grid.best_score_)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("Test F1 Macro:", f1_score(y_test, y_pred, average='macro'))
print("Test F1 Weighted:", f1_score(y_test, y_pred, average='weighted'))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
