1. Read data

In [48]:
import pandas as pd

train_df = pd.read_csv("../train_test_data/train.csv", encoding="ISO-8859-1")
test_df = pd.read_csv("../train_test_data/test.csv", encoding="ISO-8859-1")

train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [49]:
train_df = train_df[['text', 'sentiment']]
test_df = test_df[['text', 'sentiment']]

train_df['sentiment'] = train_df['sentiment'].astype(str)
test_df['sentiment'] = test_df['sentiment'].astype(str)

train_df.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [50]:
print("Label distribution in train set:")
print(train_df['sentiment'].value_counts())

print("\nLabel distribution in test set:")
print(test_df['sentiment'].value_counts())

Label distribution in train set:
sentiment
neutral     11118
positive     8582
negative     7781
Name: count, dtype: int64

Label distribution in test set:
sentiment
neutral     1430
nan         1281
positive    1103
negative    1001
Name: count, dtype: int64


2. Clean data

In [51]:
import re

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [52]:
train_df['clean_text'] = train_df['text'].apply(clean_text)
test_df['clean_text'] = test_df['text'].apply(clean_text)

train_df[['text', 'clean_text']].head()

Unnamed: 0,text,clean_text
0,"I`d have responded, if I were going",id have responded if i were going
1,Sooo SAD I will miss you here in San Diego!!!,sooo sad i will miss you here in san diego
2,my boss is bullying me...,my boss is bullying me
3,what interview! leave me alone,what interview leave me alone
4,"Sons of ****, why couldn`t they put them on t...",sons of why couldnt they put them on the relea...


3. Converting text to numerical features

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)


# Fit on training text and transform both train & test
X_train = vectorizer.fit_transform(train_df['clean_text'])
X_test = vectorizer.transform(test_df['clean_text'])

# Target labels
y_train = train_df['sentiment']
y_test = test_df['sentiment']

4. Train model

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Initialize individual classifiers
lr = LogisticRegression(max_iter=1000)
svm = SVC(probability=True)
rf = RandomForestClassifier(n_estimators=100)

# Combine them in a soft Voting Classifier (averages probabilities)
voting_clf = VotingClassifier(
    estimators=[('lr', lr), ('svm', svm), ('rf', rf)],
    voting='soft'
)

# Train the ensemble
voting_clf.fit(X_train, y_train)

In [62]:
from sklearn.pipeline import Pipeline

# Assume `vectorizer` is your fitted TfidfVectorizer and `voting_clf` is your fitted VotingClassifier
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', voting_clf)
])

5. Test model

In [68]:
from sklearn.metrics import classification_report, accuracy_score

# Predict sentiment labels for the test set
y_pred = voting_clf.predict(X_test)

# Evaluate the model's performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Accuracy:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

         nan       0.00      0.00      0.00      1281
    negative       0.75      0.63      0.68      1001
     neutral       0.37      0.78      0.50      1430
    positive       0.82      0.72      0.77      1103

    accuracy                           0.53      4815
   macro avg       0.48      0.53      0.49      4815
weighted avg       0.45      0.53      0.47      4815

Accuracy: 0.5262720664589824


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [64]:
sample_text = "I love this product! It's amazing."
prediction = pipeline.predict([sample_text])
print("Predicted sentiment:", prediction[0])

Predicted sentiment: positive


6. Save the model

In [66]:
import pickle

with open('self_model.pkl', 'wb') as f:
    pickle.dump(pipeline, f)
print("Model saved to self_model.pkl")

Model saved to self_model.pkl
