In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
import seaborn as sns 
import warnings

In [2]:
# Ignore warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('twitter_training.csv')

In [4]:
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [5]:
df.columns = ['ID', 'Platform','Review','text']

In [6]:
df.shape

(74681, 4)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        74681 non-null  int64 
 1   Platform  74681 non-null  object
 2   Review    74681 non-null  object
 3   text      73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [8]:
df.isnull().sum()

ID            0
Platform      0
Review        0
text        686
dtype: int64

In [9]:
df.dropna(inplace=True)

In [10]:
df.duplicated().sum()

2340

In [11]:
df.drop_duplicates(inplace=True)

In [12]:
df.drop(columns=['ID', 'Platform'], inplace=True)

In [13]:
df.head()

Unnamed: 0,Review,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [14]:
import re
def clean_tweet(text):
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    
    # Remove mentions (@username)
    text = re.sub(r'\@\w+', '', text)
    
    # Remove hashtags (keeping the word by removing only '#')
    text = re.sub(r'#', '', text)
    
    # Remove special characters, numbers, and punctuation
    text = re.sub(r"[^A-Za-z\s]", '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [15]:
df['text'] = df['text'].apply(clean_tweet)

In [16]:
df.head()

Unnamed: 0,Review,text
0,Positive,i am coming to the borders and i will kill you...
1,Positive,im getting on borderlands and i will kill you all
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands and i will murder yo...
4,Positive,im getting into borderlands and i can murder y...


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
# Fit and transform the TF-IDF vectorizer on the cleaned text
X = vectorizer.fit_transform(df['text'])

# Convert to a DataFrame (optional, for viewing)
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Display the first few rows of the TF-IDF feature matrix
print(tfidf_df.head())

# Display the shape of the TF-IDF matrix
print(X.shape)

    aa  aaa  aaron   ab  abandon  abilities  ability  able  absolute  \
0  0.0  0.0    0.0  0.0      0.0        0.0      0.0   0.0       0.0   
1  0.0  0.0    0.0  0.0      0.0        0.0      0.0   0.0       0.0   
2  0.0  0.0    0.0  0.0      0.0        0.0      0.0   0.0       0.0   
3  0.0  0.0    0.0  0.0      0.0        0.0      0.0   0.0       0.0   
4  0.0  0.0    0.0  0.0      0.0        0.0      0.0   0.0       0.0   

   absolutely  ...  zen  zero  zip  zoe  zombie  zombies  zone  zonestreamcx  \
0         0.0  ...  0.0   0.0  0.0  0.0     0.0      0.0   0.0           0.0   
1         0.0  ...  0.0   0.0  0.0  0.0     0.0      0.0   0.0           0.0   
2         0.0  ...  0.0   0.0  0.0  0.0     0.0      0.0   0.0           0.0   
3         0.0  ...  0.0   0.0  0.0  0.0     0.0      0.0   0.0           0.0   
4         0.0  ...  0.0   0.0  0.0  0.0     0.0      0.0   0.0           0.0   

   zoom  zuckerberg  
0   0.0         0.0  
1   0.0         0.0  
2   0.0         0.0 

In [18]:
from sklearn.model_selection import train_test_split

# Features (TF-IDF features from the 'text' column)
X = vectorizer.fit_transform(df['text'])

# Labels (from the 'Review' column)
y = df['Review']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the split datasets
print(X_train.shape)
print(X_test.shape)

(57324, 5000)
(14331, 5000)


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize the logistic regression model
model = LogisticRegression(max_iter=1000)

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print("LogisticRegression Accuracy: ", accuracy)

# Display the classification report for detailed metrics
print(classification_report(y_test, y_pred))


LogisticRegression Accuracy:  0.6790175144791013
              precision    recall  f1-score   support

  Irrelevant       0.67      0.51      0.58      2455
    Negative       0.69      0.79      0.74      4433
     Neutral       0.67      0.61      0.64      3532
    Positive       0.67      0.72      0.69      3911

    accuracy                           0.68     14331
   macro avg       0.68      0.66      0.66     14331
weighted avg       0.68      0.68      0.68     14331



In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Naive Bayes model
nb_model = MultinomialNB()

# Train the model on the training data
nb_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_nb = nb_model.predict(X_test)

# Evaluate the model's performance
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes Accuracy: ", accuracy_nb)

# Display the classification report for Naive Bayes
print("Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb))


Naive Bayes Accuracy:  0.6359639941385807
Naive Bayes Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.71      0.36      0.48      2455
    Negative       0.63      0.80      0.70      4433
     Neutral       0.67      0.53      0.59      3532
    Positive       0.60      0.72      0.66      3911

    accuracy                           0.64     14331
   macro avg       0.65      0.60      0.61     14331
weighted avg       0.65      0.64      0.62     14331



In [21]:
from sklearn.svm import SVC

# Initialize the SVM model with default parameters
svm_model = SVC()

# Train the model on the training data
svm_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model's performance
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy: ", accuracy_svm)

# Display the classification report for SVM
print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm))


SVM Accuracy:  0.8760030702672528
SVM Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.91      0.79      0.85      2455
    Negative       0.91      0.90      0.91      4433
     Neutral       0.89      0.86      0.87      3532
    Positive       0.82      0.91      0.86      3911

    accuracy                           0.88     14331
   macro avg       0.88      0.87      0.87     14331
weighted avg       0.88      0.88      0.88     14331

