Data Collection
===================

In [1]:
import pandas as pd

url = "https://raw.githubusercontent.com/nethajinirmal13/Training-datasets/main/twitter_training.csv"
df=pd.read_csv(url, header=None)

# Display the current DataFrame to understand its structure
print("Current DataFrame:")
print(df.head())

# Manually assign column names based on your data structure
# Assuming your data format is like: id, col1, col2, col3, sentiment
# Replace with your actual column names
column_names = ['id', 'some_column_name', 'target', 'sentiment']

# Assign the column names to the DataFrame
df.columns = column_names

# Display the DataFrame with updated column names
print("\nDataFrame with Updated Column Names:")
print(df.head())

# Now, the first row contains your actual data and the column names are correctly assigned


Current DataFrame:
      0            1         2  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

                                                   3  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  

DataFrame with Updated Column Names:
     id some_column_name    target  \
0  2401      Borderlands  Positive   
1  2401      Borderlands  Positive   
2  2401      Borderlands  Positive   
3  2401      Borderlands  Positive   
4  2401      Borderlands  Positive   

                                           sentiment  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting 

Data Cleaning
===============

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                74682 non-null  int64 
 1   some_column_name  74682 non-null  object
 2   target            74682 non-null  object
 3   sentiment         73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [3]:
df = df[df['sentiment'].notna()]

In [4]:
df['target'].value_counts()

target
Negative      22358
Positive      20655
Neutral       18108
Irrelevant    12875
Name: count, dtype: int64

In [5]:
mappings = {'Irrelevant': 'Neutral'}
df['target'] = df['target'].replace(mappings)

In [6]:
df['target'].value_counts()

target
Neutral     30983
Negative    22358
Positive    20655
Name: count, dtype: int64

In [7]:
df.drop(columns=["id","some_column_name"],inplace=True)

In [8]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/431.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━[0m [32m337.9/431.4 kB[0m [31m9.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: emoji
Successfully installed emoji-2.12.1


In [9]:
import unicodedata

def has_emoji(text_list):
    for text in text_list:
        for char in text:
            if char != ' ' and unicodedata.name(char).startswith('EMOJI'):
                return True
    return False

# Apply the function to the 'sentiment' column
df['has_emoji'] = df['sentiment'].apply(has_emoji)

df.head()

Unnamed: 0,target,sentiment,has_emoji
0,Positive,im getting on borderlands and i will murder yo...,False
1,Positive,I am coming to the borders and I will kill you...,False
2,Positive,im getting on borderlands and i will kill you ...,False
3,Positive,im coming on borderlands and i will murder you...,False
4,Positive,im getting on borderlands 2 and i will murder ...,False


In [10]:
df['has_emoji'].value_counts()

has_emoji
False    73994
True         2
Name: count, dtype: int64

In [11]:
df = df[df["has_emoji"] == False]

In [12]:
df['sentiment']=df['sentiment'].str.lower()
df.head()

Unnamed: 0,target,sentiment,has_emoji
0,Positive,im getting on borderlands and i will murder yo...,False
1,Positive,i am coming to the borders and i will kill you...,False
2,Positive,im getting on borderlands and i will kill you ...,False
3,Positive,im coming on borderlands and i will murder you...,False
4,Positive,im getting on borderlands 2 and i will murder ...,False


In [13]:
def remove_whitespace(text):
    return  " ".join(text.split())


df['sentiment']=df['sentiment'].apply(remove_whitespace)
df.head()

Unnamed: 0,target,sentiment,has_emoji
0,Positive,im getting on borderlands and i will murder yo...,False
1,Positive,i am coming to the borders and i will kill you...,False
2,Positive,im getting on borderlands and i will kill you ...,False
3,Positive,im coming on borderlands and i will murder you...,False
4,Positive,im getting on borderlands 2 and i will murder ...,False


In [14]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [15]:
from nltk import word_tokenize
df['sentiment']=df['sentiment'].apply(lambda X: word_tokenize(X))
df.head()

Unnamed: 0,target,sentiment,has_emoji
0,Positive,"[im, getting, on, borderlands, and, i, will, m...",False
1,Positive,"[i, am, coming, to, the, borders, and, i, will...",False
2,Positive,"[im, getting, on, borderlands, and, i, will, k...",False
3,Positive,"[im, coming, on, borderlands, and, i, will, mu...",False
4,Positive,"[im, getting, on, borderlands, 2, and, i, will...",False


In [16]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [17]:
from nltk.corpus import stopwords

en_stopwords = stopwords.words('english')


def remove_stopwords(text):
    result = []
    for token in text:
        if token not in en_stopwords:
            result.append(token)

    return result


df['sentiment']=df['sentiment'].apply(remove_stopwords)
df.head()

Unnamed: 0,target,sentiment,has_emoji
0,Positive,"[im, getting, borderlands, murder, ,]",False
1,Positive,"[coming, borders, kill, ,]",False
2,Positive,"[im, getting, borderlands, kill, ,]",False
3,Positive,"[im, coming, borderlands, murder, ,]",False
4,Positive,"[im, getting, borderlands, 2, murder, ,]",False


In [18]:
from nltk.tokenize import RegexpTokenizer

def remove_punct(text):

    tokenizer = RegexpTokenizer(r"\w+")
    lst=tokenizer.tokenize(' '.join(text))
    return lst


df['sentiment']=df['sentiment'].apply(remove_punct)
df.head()

Unnamed: 0,target,sentiment,has_emoji
0,Positive,"[im, getting, borderlands, murder]",False
1,Positive,"[coming, borders, kill]",False
2,Positive,"[im, getting, borderlands, kill]",False
3,Positive,"[im, coming, borderlands, murder]",False
4,Positive,"[im, getting, borderlands, 2, murder]",False


In [19]:
def keep_alphabetical_only(sentiment_list):
    return [word for word in sentiment_list if word.isalpha()]

# Apply the function to the 'sentiment' column
df['sentiment'] = df['sentiment'].apply(keep_alphabetical_only)
df.head()

Unnamed: 0,target,sentiment,has_emoji
0,Positive,"[im, getting, borderlands, murder]",False
1,Positive,"[coming, borders, kill]",False
2,Positive,"[im, getting, borderlands, kill]",False
3,Positive,"[im, coming, borderlands, murder]",False
4,Positive,"[im, getting, borderlands, murder]",False


In [20]:
def remove_im(sentiment_list):
    return [word for word in sentiment_list if word != 'im']

# Apply the function to the 'sentiment' column
df['sentiment'] = df['sentiment'].apply(remove_im)
df.head()

Unnamed: 0,target,sentiment,has_emoji
0,Positive,"[getting, borderlands, murder]",False
1,Positive,"[coming, borders, kill]",False
2,Positive,"[getting, borderlands, kill]",False
3,Positive,"[coming, borderlands, murder]",False
4,Positive,"[getting, borderlands, murder]",False


In [21]:
# Convert lists to tuples in the 'sentiment' column
df['sentiment'] = df['sentiment'].apply(tuple)

# Drop duplicates
df = df.drop_duplicates()

# Convert tuples back to lists (if necessary)
df['sentiment'] = df['sentiment'].apply(list)

# Update the len_sent_1 column to reflect the new length of sentiment lists
# df['len_sent_1'] = df['sentiment'].apply(len)

# Check the shape after dropping duplicates
print("Shape after dropping duplicates:", df.shape)

# Print the updated DataFrame
# print(df)

Shape after dropping duplicates: (61093, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = df['sentiment'].apply(list)


In [22]:
def remove_single_letters(text):
    return [word for word in text if len(word) > 1]

# Apply the function to the 'sentiment' column
df['sentiment'] = df['sentiment'].apply(remove_single_letters)

In [23]:
df["len_sent"] = df['sentiment'].apply(lambda X: len(X))
df.head()

Unnamed: 0,target,sentiment,has_emoji,len_sent
0,Positive,"[getting, borderlands, murder]",False,3
1,Positive,"[coming, borders, kill]",False,3
2,Positive,"[getting, borderlands, kill]",False,3
3,Positive,"[coming, borderlands, murder]",False,3
6,Positive,"[spent, hours, making, something, fun, know, h...",False,26


In [24]:
df = df[df['len_sent'] != 0]
df.shape

(61072, 4)

In [25]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

def lemmatization(text):

    result=[]
    wordnet = WordNetLemmatizer()
    for token,tag in pos_tag(text):
        pos=tag[0].lower()

        if pos not in ['a', 'r', 'n', 'v']:
            pos='n'

        result.append(wordnet.lemmatize(token,pos))

    return result


df['sentiment']=df['sentiment'].apply(lemmatization)
df.head()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,target,sentiment,has_emoji,len_sent
0,Positive,"[get, borderland, murder]",False,3
1,Positive,"[come, border, kill]",False,3
2,Positive,"[get, borderland, kill]",False,3
3,Positive,"[come, borderland, murder]",False,3
6,Positive,"[spent, hour, make, something, fun, know, huge...",False,26


In [26]:
df.drop(columns=["has_emoji","len_sent"],inplace=True)

In [27]:
df['sentiment_joined'] = df['sentiment'].apply(lambda x: ' '.join(x))
df.head()

Unnamed: 0,target,sentiment,sentiment_joined
0,Positive,"[get, borderland, murder]",get borderland murder
1,Positive,"[come, border, kill]",come border kill
2,Positive,"[get, borderland, kill]",get borderland kill
3,Positive,"[come, borderland, murder]",come borderland murder
6,Positive,"[spent, hour, make, something, fun, know, huge...",spent hour make something fun know huge border...


Model Training
==============

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['sentiment_joined'])

In [29]:
from sklearn.preprocessing import LabelEncoder

# Convert target column to numerical values
le = LabelEncoder()
y = le.fit_transform(df['target'])

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [31]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [44]:
model = KNeighborsClassifier()

# Train the ExtraTreesRegressor
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

y_pred_train = model.predict(X_train)

r2_test = accuracy_score(y_test, y_pred)

r2_train = accuracy_score(y_train, y_pred_train)
print(r2_test,r2_train)

0.9154318460908719 0.9620525206214053


In [37]:
from sklearn.metrics import classification_report
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      3740
           1       0.92      0.95      0.93      5204
           2       0.96      0.81      0.88      3271

    accuracy                           0.92     12215
   macro avg       0.92      0.91      0.91     12215
weighted avg       0.92      0.92      0.91     12215



In [40]:
pd.Series(y_train).value_counts()

1    20815
0    14961
2    13081
Name: count, dtype: int64

In [50]:
a = "it is a worst course to learn "
a = a.lower()
a = " ".join(a.split())
a = word_tokenize(a)
a = remove_stopwords(a)
a = remove_punct(a)
a = keep_alphabetical_only(a)
a = remove_single_letters(a)
a = lemmatization(a)
a = " ".join(a)
a

'worst course learn'

In [51]:
aa_transformed = vectorizer.transform([a])

# Predict the sentiment using the loaded model
y_pred = model.predict(aa_transformed)
print(y_pred)

[0]
