In [60]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [61]:
df = pd.read_csv("/kaggle/input/twitter-tweets-sentiment-dataset/Tweets.csv")
df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [62]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [63]:
def clean_text(t):
    t = str(t).lower()
    t = re.sub(r"http\S+|www.\S+", "", t)
    t = re.sub(r"@[A-Za-z0-9_]+", "", t)
    t = re.sub(r"[^a-z\s]", "", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

df["text"] = df["text"].apply(clean_text)
df["sentiment"] = df["sentiment"].map({"negative": 0,"neutral": 1,"positive": 2})
df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,id have responded if i were going,"I`d have responded, if I were going",1
1,549e992a42,sooo sad i will miss you here in san diego,Sooo SAD,0
2,088c60f138,my boss is bullying me,bullying me,0
3,9642c003ef,what interview leave me alone,leave me alone,0
4,358bd9e861,sons of why couldnt they put them on the relea...,"Sons of ****,",0
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on denver husband los...,d lost,0
27477,4f4c4fc327,ive wondered about rake to the client has made...,", don`t force",0
27478,f67aae2310,yay good for both of you enjoy the break you p...,Yay good for both of you.,2
27479,ed167662a5,but it was worth it,But it was worth it ****.,2


In [64]:
df = df[['text', 'sentiment']]
df

Unnamed: 0,text,sentiment
0,id have responded if i were going,1
1,sooo sad i will miss you here in san diego,0
2,my boss is bullying me,0
3,what interview leave me alone,0
4,sons of why couldnt they put them on the relea...,0
...,...,...
27476,wish we could come see u on denver husband los...,0
27477,ive wondered about rake to the client has made...,0
27478,yay good for both of you enjoy the break you p...,2
27479,but it was worth it,2


In [65]:
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["sentiment"], test_size=0.2, random_state=42
)

tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

In [66]:
svm = LinearSVC()
svm.fit(X_train_vec, y_train)

# RESULT

In [67]:
pred = svm.predict(X_test_vec)

accuracy = accuracy_score(y_test, pred)
cm = confusion_matrix(y_test, pred)
report = classification_report(y_test, pred)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", report)

Accuracy: 0.6641193595342066

Confusion Matrix:
 [[ 920  529  123]
 [ 376 1515  345]
 [  87  386 1215]]

Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.59      0.62      1572
           1       0.62      0.68      0.65      2236
           2       0.72      0.72      0.72      1688

    accuracy                           0.66      5496
   macro avg       0.67      0.66      0.66      5496
weighted avg       0.67      0.66      0.66      5496

