In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
columns = ["target","id","date","flag","user","tweet"]

In [4]:
path = "../all_datasets/nlp/training.1600000.processed.noemoticon.csv"

In [5]:
# Load the tweets CSV; column names are taken from `columns` and the file is
# decoded as ISO-8859-1.
df = pd.read_csv(
    path,
    names=columns,
    encoding="ISO-8859-1",
)

df.head()

Unnamed: 0,target,id,date,flag,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


### Check for the null values

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   id      1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   tweet   1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [7]:
df["target"].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

# change 4 to 1
# 0--> negative tweet
# 1--> positive tweet

In [8]:
# Recode the label 4 as 1 so the target column holds only 0 and 1.
df["target"] = df["target"].replace({4: 1})

In [9]:
df["target"].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [10]:
delete_columns = [ 'id', 'date', 'flag', 'user']

In [11]:
# Remove the columns listed in `delete_columns`.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run once those columns are already gone.
df = df.drop( columns=delete_columns , errors="ignore" )

df.head()

Unnamed: 0,target,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


### import all the required NLP packages

In [12]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [14]:
port_stem = PorterStemmer()

In [15]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return the English stopword list as a frozenset, loading it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # stopwords.words() hands back a list; asking for it once and keeping a
        # set avoids repeating that call, and a linear scan, for every token.
        _ENGLISH_STOPWORDS = frozenset( stopwords.words('english') )
    return _ENGLISH_STOPWORDS


def stemming( content ):
    """Normalize one piece of text into a space-joined string of word stems.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, English stopwords are dropped, and
    each remaining token is stemmed with ``port_stem``.

    Parameters
    ----------
    content : str
        Raw text to clean (here, a single tweet).

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    stop_words = _english_stopwords()

    #  keep only the alphabets
    letters_only = re.sub( '[^a-zA-Z]' , ' ' , content )

    # lower-case, then split on whitespace into tokens
    tokens = letters_only.lower().split()

    # stem every token that is not a stopword
    stems = [ port_stem.stem(word) for word in tokens if word not in stop_words ]

    # join the list of strings
    return ' '.join(stems)

In [None]:
df['tweet_stemp'] = df['tweet'].apply( stemming )

In [None]:
df.head()

### separate the independent and dependent features

In [None]:
X = df['tweet_stemp'].astype(str).tolist()
y = df['target'].values

In [None]:
vectorizer = TfidfVectorizer()

### split the data into train and test

In [None]:
X_train , X_test , y_train , y_test = train_test_split( X , y , test_size = 0.25 , random_state = 42 )

In [None]:
X_train_vector = vectorizer.fit_transform( X_train )

X_test_vector = vectorizer.transform( X_test )

### Models importing

In [None]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [None]:
# Instantiate the three candidate classifiers with default hyper-parameters.
gnb = GaussianNB()
svc = SVC()
# Seed the forest (same seed as the train/test split) so that a fresh
# Restart & Run All reproduces the same trees and the same metrics.
rfc = RandomForestClassifier( random_state = 42 )

### Random Forest training and results

In [None]:
# Fit the random forest on the TF-IDF matrix of the training tweets.
rfc.fit( X_train_vector , y_train )
# Predict on the vectorized test split: X_test itself is still the raw list of
# tweet strings, not the feature matrix the model was trained on.
y_rfc_pred = rfc.predict( X_test_vector )

In [None]:
# Report the random forest's test-set results: confusion matrix first,
# then overall accuracy, then the per-class classification report.
rfc_confusion = confusion_matrix( y_test , y_rfc_pred )
print( rfc_confusion )

rfc_accuracy = accuracy_score( y_test , y_rfc_pred )
print(f'Accuracy Score of Random Forest: {rfc_accuracy}')

rfc_report = classification_report( y_test , y_rfc_pred )
print( rfc_report )


### Naive bayes training and results

In [None]:
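# NOTE: GaussianNB only accepts dense arrays, so it cannot be fit on the sparse
# TF-IDF matrix as-is (MultinomialNB works on sparse input directly).
# gnb.fit( X_train_vector , y_train )
# y_gnb_pred = gnb.predict(X_test_vector)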

In [None]:
# print( confusion_matrix(y_test  , y_gnb_pred) )

# print(f'Accuracy Score of Naive Bayes: {accuracy_score(y_test  , y_gnb_pred)}')

# print( classification_report(y_test  , y_gnb_pred)  )


### Support vector machine training and results

In [None]:
# svc.fit( X_train_vector , y_train )
# y_svc_pred = svc.predict(X_test_vector)

In [None]:
# print( confusion_matrix(y_test  , y_svc_pred) )

# print(f'Accuracy Score of SVC: {accuracy_score(y_test  , y_svc_pred)}')

# print( classification_report(y_test  , y_svc_pred)  )
