In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import BernoulliNB,MultinomialNB
import time

In [2]:
df=pd.read_csv("training.1600000.processed.noemoticon.csv",encoding="ISO-8859-1",names=['target','id','Date','Flag','User','Tweet'])

In [3]:
df

Unnamed: 0,target,id,Date,Flag,User,Tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [4]:
# Step 2: Data Preprocessing
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, remove_user_mentions=True, remove_urls=True, stopwords_removal=True):
        self.remove_user_mentions = remove_user_mentions
        self.remove_urls = remove_urls
        self.stopwords_removal = stopwords_removal
        self.stopwords = set(stopwords.words('english'))

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        preprocessed_text = []
        for text in X:
            if self.remove_user_mentions:
                text = re.sub(r'@\w+', '', text)
            if self.remove_urls:
                text = re.sub(r'http\S+|www\S+', '', text)
            text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
            text = text.lower().strip()

            if self.stopwords_removal:
                tokens = text.split()
                tokens = [token for token in tokens if token not in self.stopwords]
                text = ' '.join(tokens)

            preprocessed_text.append(text)

        return preprocessed_text

In [5]:
# Step 3: Replace 4 with 1 in the target column
df['target'] = df['target'].replace(4, 1)

In [6]:
# Step 4: Split the dataset into train and test sets
X = df['Tweet']
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Step 5: Build and train the models
models = {
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(),
    'ANN': MLPClassifier(),
    'SVC': SVC(kernel='linear')
}

In [8]:
best_models = {}

for model_name, model in models.items():
    # Create a pipeline with TextPreprocessor, TfidfVectorizer, and the current model as steps
    pipe = Pipeline([
        ('preprocessor', TextPreprocessor()),
        ('vectorizer', TfidfVectorizer(lowercase=False)),
        ('classifier', model)
    ])

    # Start the timer
    start_time = time.time()

    # Fit the pipeline to the training data
    pipe.fit(X_train, y_train)

    # Calculate the training time
    training_time = time.time() - start_time

    # Store the trained model and training time in the dictionary of best models
    best_models[model_name] = {'model': pipe, 'training_time': training_time}

In [9]:
training_time

46.90131449699402

In [10]:
print(f"=== {model} ===")
print("Train Accuracy:", model.score(X_train, y_train))
print("Test Accuracy:", model.score(X_test, y_test))
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("====================")

=== Pipeline(steps=[('preprocessor', TextPreprocessor()),
                ('vectorizer', TfidfVectorizer(lowercase=False)),
                ('classifier', MultinomialNB())]) ===
Train Accuracy: 0.8105515625
Test Accuracy: 0.76495
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.79      0.77    159494
           4       0.78      0.74      0.76    160506

    accuracy                           0.76    320000
   macro avg       0.77      0.77      0.76    320000
weighted avg       0.77      0.76      0.76    320000

Confusion Matrix:
 [[125396  34098]
 [ 41118 119388]]


In [None]:
print(f"=== {model} ===")
print("Train Accuracy:", model.score(X_train, y_train))
print("Test Accuracy:", model.score(X_test, y_test))
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("====================")