# Kaggle Dataset https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis

In [1]:
import pandas as pd
df=pd.read_csv('data/twitter_training.csv')

In [2]:
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [3]:
df=df.iloc[:,2:]
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df.rename(columns={'im getting on borderlands and i will murder you all ,': 'text','Positive':'sentiment'}, inplace=True)
df = df[df['sentiment'] != 'Irrelevant']

In [4]:
df.head()

Unnamed: 0,sentiment,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


## Cleaning

In [5]:
import re
def clean_text(t):
    t = str(t).lower()
    t = re.sub(r"http\S+|www.\S+", "", t)
    t = re.sub(r"@[A-Za-z0-9_]+", "", t)
    t = re.sub(r"[^a-z\s]", "", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

df["text"] = df["text"].apply(clean_text)
df

Unnamed: 0,sentiment,text
0,Positive,i am coming to the borders and i will kill you...
1,Positive,im getting on borderlands and i will kill you all
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands and i will murder yo...
4,Positive,im getting into borderlands and i can murder y...
...,...,...
74676,Positive,just realized that the windows partition of my...
74677,Positive,just realized that my mac window partition is ...
74678,Positive,just realized the windows partition of my mac ...
74679,Positive,just realized between the windows partition of...


In [6]:
df['sentiment'].value_counts()

sentiment
Negative    21237
Positive    19137
Neutral     17110
Name: count, dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df['sentiment']=le.fit_transform(df['sentiment'])


In [8]:
df.sample(10)

Unnamed: 0,sentiment,text
36958,2,awesome news such a big fan of this
73312,1,game make a k flash card from your current in ...
22285,2,i played csgo tonight i miss grinding the game...
52423,1,of course feel the beauty of the landscape
62502,0,rockstar makes so much money with shark cards ...
28599,1,happy happy halloween
7554,2,i know its a joke but that actually makes me s...
66532,2,by combining learn under bufflyfcqupt ex
63556,0,of lover fuck you fuck lover fuck love fuck lo...
59779,0,people believe everything they say publish


In [9]:
x=df.iloc[:,1]
y=df.iloc[:,0]

In [10]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=10)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
x_train_vec = vectorizer.fit_transform(x_train)  
x_test_vec = vectorizer.transform(x_test) 
x_train_vec.shape

(43113, 29293)

# Result

In [12]:
from sklearn.svm import LinearSVC

svm = LinearSVC(C=0.5)
svm.fit(x_train_vec, y_train)
y_pred=svm.predict(x_test_vec)

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8611091782061095

Confusion Matrix:
 [[4727  271  306]
 [ 429 3479  395]
 [ 348  247 4169]]

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.89      0.87      5304
           1       0.87      0.81      0.84      4303
           2       0.86      0.88      0.87      4764

    accuracy                           0.86     14371
   macro avg       0.86      0.86      0.86     14371
weighted avg       0.86      0.86      0.86     14371

