In [1]:
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

In [2]:
sentiment_data = pd.read_csv('twitter_training.csv')
sentiment_data.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [3]:
sentiment_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   2401                                                   74681 non-null  int64 
 1   Borderlands                                            74681 non-null  object
 2   Positive                                               74681 non-null  object
 3   im getting on borderlands and i will murder you all ,  73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [4]:
sentiment_data.describe(include = 'all')

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
count,74681.0,74681,74681,73995.0
unique,,32,4,69490.0
top,,TomClancysRainbowSix,Negative,
freq,,2400,22542,172.0
mean,6432.640149,,,
std,3740.423819,,,
min,1.0,,,
25%,3195.0,,,
50%,6422.0,,,
75%,9601.0,,,


# Only going to work on sentiment column and sentences column

In [5]:
df = sentiment_data.iloc[:, [2, 3]]

In [6]:
df.head()

Unnamed: 0,Positive,"im getting on borderlands and i will murder you all ,"
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [7]:
df.columns

Index(['Positive', 'im getting on borderlands and i will murder you all ,'], dtype='object')

In [8]:
df.rename(columns = {'Positive' : 'sentiments', 'im getting on borderlands and i will murder you all ,' : 'sentences'},
          inplace = True)

In [9]:
df.head()

Unnamed: 0,sentiments,sentences
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [10]:
df.sentiments.value_counts()

sentiments
Negative      22542
Positive      20831
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

# checking null values

In [11]:
df.isnull().sum()

sentiments      0
sentences     686
dtype: int64

In [12]:
df.isnull().sum()['sentences']

686

In [13]:
(df.isnull().sum()['sentences'] / len(df['sentences'])) * 100

0.9185736666622032

# As we can see there is less than 1% data missing so decided to remove them

In [14]:
df.dropna(inplace = True)

In [15]:
df.isnull().sum()

sentiments    0
sentences     0
dtype: int64

In [16]:
len(df['sentiments']), len(df['sentences'])

(73995, 73995)

# Checking duplicate values

In [17]:
df.duplicated().sum()

4227

In [18]:
df[df.duplicated()]['sentiments'].value_counts()

sentiments
Positive      1517
Negative      1121
Neutral        998
Irrelevant     591
Name: count, dtype: int64

In [19]:
# df.drop_duplicates() to remove rows
# subset Argument: You can specify which columns to consider when identifying duplicates.
# df.drop_duplicates(subset=['col'])

# Taking only 10000 reviews because of low-end laptop (will check solution at the end)

In [20]:
new_df = df.iloc[:10000, :]

In [21]:
new_df.shape

(10000, 2)

In [22]:
new_df['sentiments'].value_counts()

sentiments
Positive      3230
Neutral       2601
Negative      2306
Irrelevant    1863
Name: count, dtype: int64

# Start analysing

In [23]:
sentiments = ['Irrelevant', 'Positive', 'Negative', 'Neutral']

for i in range(len(sentiments)):
    print(sentiments[i])
    print('-'*100)
    
    for j in range(100, 1000, 100):
        print(new_df[new_df['sentiments'] == sentiments[i]].iloc[j, 1].lower())
        print()
        print()

Irrelevant
----------------------------------------------------------------------------------------------------
making games is a brutally difficult family business. without the team, you have done nothing. our industry itself is already very brutally badly underpaid for what little we do when vs others. this is shameful.


algeria: women in cross-border operations


nice little team wipe by @averyjeanttv with the one tap!. clips.twitch.tv/roundsucculent…. .  .  


and youtube. com / google watch? v = y - yof8 … we find life


ah, nothing is more irritating than 3 different letters in the title.


you excited my brother ? @_ashleysm1th


excuse god?.


congrats @dallasempire on the 2020 championship win! and who else won - @callofduty


it is not the first time that the eu commission has taken such a step.


Positive
----------------------------------------------------------------------------------------------------
<unk> 3 is good so far but guardian 2 will always be the fav


loving 

# Text cleaning and preprocessing

### What to remove ?
* Punctuations
* links
* need to lower the words
* @ and word associated with it

In [24]:
import nltk
from nltk.corpus import stopwords
 
nltk.download('stopwords')
# print(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/pratik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/pratik/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [26]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/pratik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:


from nltk.corpus import stopwords


 
# word_tokens = word_tokenize("This is a farm. The very beautiful farm")
# # converts the words in word_tokens to lower case and then checks whether 
# #they are present in stop_words or not
# filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
# #with no lower case conversion
# filtered_sentence = []
 
# for w in word_tokens:
#     if w not in stop_words:
#         filtered_sentence.append(w)
 
# print(word_tokens)
# print(filtered_sentence)

In [33]:
stop_words = set(stopwords.words('english'))
import re
lemmatizer = WordNetLemmatizer()

def remove_stopwords(text):

    word_tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(w) for w in word_tokens if not w in stop_words]
    cleaned_text = " ".join(lemmatized_tokens)
    return cleaned_text

In [34]:
from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def clean_text(text):

    t = text.str.lower()

    # removes multiple spaces with single space
    t = t.str.replace(r'\s+', ' ', regex = True)

    # removes all punctuations
    t = t.str.replace(r'[^\w\s]', '', regex = True)

    # removes specific characters
    t = t.str.replace(r'[!@#\$%^&*()]', '', regex = True)

    cleaned_text = t.apply(remove_stopwords)

    # lemmatized_tokens = lemmatizer.lemmatize(t)
    # cleaned_text = " ".join(lemmatized_tokens)
    # return cleaned_text
    
    return cleaned_text

In [35]:
new_df['new_sentences'] = clean_text(new_df['sentences'])

In [36]:
new_df.head()

Unnamed: 0,sentiments,sentences,new_sentences
0,Positive,I am coming to the borders and I will kill you...,coming border kill
1,Positive,im getting on borderlands and i will kill you ...,im getting borderland kill
2,Positive,im coming on borderlands and i will murder you...,im coming borderland murder
3,Positive,im getting on borderlands 2 and i will murder ...,im getting borderland 2 murder
4,Positive,im getting into borderlands and i can murder y...,im getting borderland murder


In [37]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [38]:
new_df['sentiments'] = le.fit_transform(new_df['sentiments'])

In [66]:
df.iloc[1000:2000]

Unnamed: 0,sentiments,sentences
1004,Negative,@EpicGames on @2K why add crossplay for @Steam...
1005,Negative,@EpicGames 4 @2K @Steam why not add crossplay ...
1006,Negative,made @EpicGames @2K why add crossplay for it 3...
1007,Irrelevant,Got to repaint another Nerf gun for the Tiny T...
1008,Irrelevant,I need to paint over another Nerf gun for the ...
...,...,...
2001,Irrelevant,Congratulations given to Borderlands Research ...
2002,Irrelevant,Congratulations to Borderlands Foundation Inte...
2003,Neutral,Borderlands 3. Firebug (Bronze). Sabotage all ...
2004,Neutral,Borderlands 3. Firebug (Bronze): Sabotage of a...


In [67]:
new_df.iloc[1000:2000]

Unnamed: 0,sentiments,sentences,new_sentences
1004,1,@EpicGames on @2K why add crossplay for @Steam...,epicgames 2k add crossplay steam 3 werent plan...
1005,1,@EpicGames 4 @2K @Steam why not add crossplay ...,epicgames 4 2k steam add crossplay borderland ...
1006,1,made @EpicGames @2K why add crossplay for it 3...,made epicgames 2k add crossplay 3 werent gon n...
1007,0,Got to repaint another Nerf gun for the Tiny T...,got repaint another nerf gun tiny tina cosplay...
1008,0,I need to paint over another Nerf gun for the ...,need paint another nerf gun tiny tina cosplay ...
...,...,...,...
2001,0,Congratulations given to Borderlands Research ...,congratulation given borderland research insti...
2002,0,Congratulations to Borderlands Foundation Inte...,congratulation borderland foundation internati...
2003,2,Borderlands 3. Firebug (Bronze). Sabotage all ...,borderland 3 firebug bronze sabotage statue ember
2004,2,Borderlands 3. Firebug (Bronze): Sabotage of a...,borderland 3 firebug bronze sabotage ember statue


In [39]:
new_df['sentiments'].unique()

array([3, 2, 1, 0])

In [None]:
# ['3 Positive', '1 Negative', '0 Irrelvant', '2 Neutral']

In [40]:
new_df.head()

Unnamed: 0,sentiments,sentences,new_sentences
0,3,I am coming to the borders and I will kill you...,coming border kill
1,3,im getting on borderlands and i will kill you ...,im getting borderland kill
2,3,im coming on borderlands and i will murder you...,im coming borderland murder
3,3,im getting on borderlands 2 and i will murder ...,im getting borderland 2 murder
4,3,im getting into borderlands and i can murder y...,im getting borderland murder


In [41]:
final_df = new_df[['new_sentences', 'sentiments']]
final_df.head()

Unnamed: 0,new_sentences,sentiments
0,coming border kill,3
1,im getting borderland kill,3
2,im coming borderland murder,3
3,im getting borderland 2 murder,3
4,im getting borderland murder,3


In [42]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [43]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(final_df['new_sentences'], final_df['sentiments'], test_size=0.2, random_state=42)

# Create TF-IDF vectors
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [44]:
# Train Logistic Regression model
model = LogisticRegression()
model.fit(X_train_vec, y_train)

In [45]:
from sklearn.metrics import accuracy_score

In [46]:
# Make predictions
y_pred = model.predict(X_test_vec)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8695


In [47]:
# training the model on training set
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [48]:
gnb.fit(X_train_vec.toarray(), y_train)

In [49]:
# Make predictions
y_pred_gnb = gnb.predict(X_test_vec.toarray())

# Evaluate performance
accuracy_gnb = accuracy_score(y_test, y_pred_gnb)
print(f"Accuracy: {accuracy_gnb}")

Accuracy: 0.8075


In [50]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [72]:
# Train SVM classifier
svm_classifier = SVC(kernel = 'rbf', probability = True)
svm_classifier.fit(X_train_vec, y_train)

In [73]:
# Make predictions
y_pred_svm = svm_classifier.predict(X_test_vec)

# Evaluate performance
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy: {accuracy_svm}")

Accuracy: 0.949


In [53]:
print('SVM Classification Report')
print(classification_report(y_test, y_pred_svm))

SVM Classification Report
              precision    recall  f1-score   support

           0       0.98      0.91      0.94       374
           1       0.97      0.94      0.95       429
           2       0.95      0.95      0.95       492
           3       0.92      0.97      0.95       705

    accuracy                           0.95      2000
   macro avg       0.96      0.94      0.95      2000
weighted avg       0.95      0.95      0.95      2000



In [54]:
import pickle, joblib

In [117]:
# Save the model and vectorizer
with open('text_model.pkl', 'wb') as f:
    pickle.dump({'vectorizer': vectorizer, 'model': svm_classifier}, f)

In [118]:
# Load the model and vectorizer
with open('text_model.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

In [119]:
loaded_vectorizer = loaded_data['vectorizer']
loaded_model = loaded_data['model']

In [120]:
loaded_model_pred = loaded_model.predict(X_test_vec)

In [121]:
# Evaluate performance
accuracy_svm_loaded_model = accuracy_score(y_test, loaded_model_pred)
print(f"Accuracy: {accuracy_svm_loaded_model}")

Accuracy: 0.949


In [55]:
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
joblib.dump(svm_classifier, "svm_classifier.joblib")

['svm_classifier.joblib']

In [59]:
# New text for prediction (single string)
new_text = "movies"

# Load and predict
loaded_vectorizer = joblib.load("tfidf_vectorizer.joblib")
loaded_classifier = joblib.load("svm_classifier.joblib")

X_new = loaded_vectorizer.transform([new_text]) # Important: Wrap in a list
prediction = loaded_classifier.predict(X_new)
print(f"Prediction for '{new_text}': {prediction}")

Prediction for 'movies': [3]


In [68]:
pred = ''

In [70]:
pred = pred + 'hello' 

In [71]:
pred

'hello'