# Sentiment Analysis

# Importing modules

In [98]:
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [99]:
# Only the first 100 reviews are used in this notebook. Passing nrows stops the
# parser after 100 rows instead of parsing the whole CSV and discarding the rest
# with .head(100); the resulting frame is the same.
df = pd.read_csv(
    'https://raw.githubusercontent.com/Ataullha/CSE476-Machine-Learning-Lab/main/IMDB%20Dataset.csv',
    nrows=100,
)
df.shape

(100, 2)

In [100]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [101]:
# Quick schema check: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     100 non-null    object
 1   sentiment  100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB


In [102]:
# Number of missing values in each column.
df.isna().sum()

review       0
sentiment    0
dtype: int64

# Label Encoding

In [103]:
# Replace the string sentiment labels with integer codes. The column is
# overwritten in place, so the original strings are only recoverable through
# the fitted encoder `le`.
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])
# In the rows displayed, 'positive' becomes 1 and 'negative' becomes 0.
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [104]:
# The stopword list ships separately from the nltk package. Fetch it only when
# it is missing so the notebook also runs on a fresh environment.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

# Peek at the first few English stopwords.
stopwords.words('english')[:5]

['i', 'me', 'my', 'myself', 'we']

In [105]:
def remove_invalid_char(review):
    """Lower-case ``review`` and blank out everything that is not a-z.

    Each character outside the ASCII lowercase letters (digits, punctuation,
    whitespace, HTML markup, non-ASCII letters) is replaced by a single space,
    so the returned string has one output character per character of the
    lower-cased input.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text containing only ``a``-``z`` and spaces.
    """
    allowed = string.ascii_lowercase
    # Build the result in one join instead of repeated string concatenation.
    return "".join(ch if ch in allowed else " " for ch in review.lower())

def stemming(review):
    """Drop stopwords from ``review`` and Porter-stem the remaining words.

    The text is split on whitespace. Words that are English stopwords, or the
    token ``br``, are discarded; every other word is stemmed.

    Parameters
    ----------
    review : str
        Text to process, split on whitespace.

    Returns
    -------
    str
        The stemmed words joined by single spaces, in their original order.
    """
    # Build the exclusion set once per call. Rebuilding it inside the loop
    # re-reads the stopword list for every single word.
    excluded = set(stopwords.words('english'))
    excluded.add('br')
    ps = PorterStemmer()
    return " ".join(ps.stem(word) for word in review.split() if word not in excluded)
        

In [106]:
# Clean and stem every review. Only the 'review' column is needed, so iterate
# over it directly rather than building a Series per row with iterrows().
# `df` itself is left untouched; the processed text lives in `corpus`.
corpus = [stemming(remove_invalid_char(review)) for review in df['review']]

In [107]:
# The frame still holds the raw review text; the cleaned text was collected in `corpus`.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [108]:
# Features are the cleaned review strings; targets are the encoded sentiment column.
X, y = corpus, df['sentiment']

In [109]:
# First five cleaned reviews (at this point X is still a list of strings).
X[:5]

['one review mention watch oz episod hook right exactli happen first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away would say main appeal show due fact goe show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw struck nasti surreal say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that get touch darker side',
 'won

In [110]:
# First five encoded labels.
y.head()

0    1
1    1
2    1
3    0
4    1
Name: sentiment, dtype: int32

# Feature Extraction

In [111]:
# TF-IDF vectorizer limited to 500 features. Note that despite the name `cv`
# this is a TfidfVectorizer, not a CountVectorizer.
cv = TfidfVectorizer(max_features=500)

In [112]:
# Turn the cleaned reviews into a dense TF-IDF matrix. From here on X is a
# NumPy array rather than the list of strings assigned earlier.
# NOTE(review): the vectorizer is fitted on the whole corpus, i.e. before the
# train/test split, so its vocabulary and IDF weights also see the test reviews.
X = cv.fit_transform(corpus).toarray()

In [113]:
# TF-IDF rows for the first five reviews.
X[:5]

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.11590469, 0.        ,
        0.12577851],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.06917517, 0.10713544, ..., 0.        , 0.        ,
        0.        ]])

In [114]:
# Split the cleaned text first, then fit TF-IDF on the training reviews only.
# Fitting the vectorizer on the full corpus lets the test reviews influence the
# vocabulary and IDF weights (data leakage), which can inflate the held-out score.
# The test fraction and seed are unchanged.
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, y, test_size=0.2, random_state=57
)
X_train = cv.fit_transform(corpus_train).toarray()
# transform() only: reuse the vocabulary and IDF weights learned from the training set.
X_test = cv.transform(corpus_test).toarray()

# Model

In [115]:
# Support vector classifier with a linear kernel; all other settings are left at their defaults.
model = SVC(kernel='linear')

In [116]:
# Train the classifier on the training split.
model.fit(X_train, y_train)

SVC(kernel='linear')

In [117]:
# Score on the held-out 20% split.
# NOTE(review): with test_size=0.2 on 100 rows this is only 20 reviews, so the
# figure is a very rough estimate.
model.score(X_test, y_test)

1.0

In [118]:
# Predicted labels for the test split, used for the report in the next cell.
y_pred = model.predict(X_test)

In [119]:
# Per-class precision, recall and F1 on the test split.
report = classification_report(y_test, y_pred)
# The report is a multi-line string, so print() is used to render its line
# breaks instead of showing the raw repr.
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00         5

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

