In [None]:
import nltk # Import nltk 
import os # Import operating system utilities (used to build file paths)
import pandas as pd # Import pandas 
import numpy as np # Import numpy 
from nltk.tokenize import word_tokenize # Import word_tokenize
import re # Import regular expression
from nltk.corpus import stopwords  # Import Stop words
from nltk.stem import PorterStemmer,SnowballStemmer   # Import PorterStemmer,SnowballStemmer
# from nltk.stem import WordNetLemmatizer # Import WordNetLemmatizer
# nltk.download('wordnet')

# Read Dataset

In [None]:
DATA_PATH = '/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/' # Path of Data 

### Read Train Data

In [None]:
train_df =pd.read_csv(os.path.join(DATA_PATH,'Train.csv'))  # read train dataset
train_df

### Exploring Train Dataset

In [None]:
train_df.head()

In [None]:
train_df.tail()

In [None]:
train_df.shape

In [None]:
train_df.info()

In [None]:
train_df.describe()

In [None]:
train_df['label'].value_counts()

### Read Test Data

In [None]:
test_df = pd.read_csv(os.path.join(DATA_PATH,'Test.csv'))  # read test dataset
test_df

### Exploring Test Dataset

In [None]:
test_df.head()

In [None]:
test_df.tail()

In [None]:
test_df.shape

In [None]:
test_df.info()

In [None]:
test_df.describe()

In [None]:
test_df['label'].value_counts()

# Pre_processing

In [None]:
# Example text: take the training review stored under index label 10 and use it
# to demonstrate each preprocessing step below on a single document.
example_text=train_df['text'][10]
example_text

### 1_Punctuation

In [None]:
# Replace every character that is not an ASCII letter or an apostrophe with a
# space (digits, punctuation and markup characters are all blanked out).
example_text_pucutation=re.sub('[^a-zA-Z\']'," ",example_text)
example_text_pucutation

### 2_lower_case

In [None]:
example_text_lower=example_text_pucutation.lower()
example_text_lower

### 3_Tokenization

In [None]:
# #with split
example_text_split=example_text_lower.split()
print(example_text_split)
print('-------------------------------------------------------------------------')
# with tokenization
example_text_tokenization=word_tokenize(example_text_lower)
print(example_text_tokenization)
print('-------------------------------------------------------------------------')

### 4_stop_words

In [None]:
# Start from NLTK's English stop-word list, but keep the words listed in
# `negative` so they are NOT removed from the reviews later on.
stop_words = stopwords.words('english')
negative = [
    'don', "don't", 'no', 'nor', 'not', "aren't", 'aren',
    'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
    'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
    'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't",
    'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
    'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
    'wouldn', "wouldn't", 'can',
]
# Same order as the NLTK list, minus the protected words.
final_stop_words = [
    candidate for candidate in stop_words if candidate not in negative
]
print(final_stop_words)
        

In [None]:
# Drop every token of the example review that appears in the stop-word list.
example_text_after_stopword = [
    token for token in example_text_tokenization
    if token not in final_stop_words
]

print(example_text_after_stopword)


### 5_Stemming

In [None]:
# Reduce each remaining token of the example review to its Porter stem.
# Alternative stemmer: stemmer = SnowballStemmer(language='english')
stemmer = PorterStemmer()
exmaple_after_stemming = [
    stemmer.stem(token) for token in example_text_after_stopword
]

print(exmaple_after_stemming)

### Lemmatization

In [None]:
# Lemmatization alternative to stemming (disabled). To enable it, also
# uncomment the WordNetLemmatizer import and nltk.download('wordnet') in the
# imports cell at the top of the notebook.
# lm=WordNetLemmatizer()
# exmaple_after_lemmatization=[]
# for word in example_text_after_stopword:
#     text_lemmatization=lm.lemmatize(word)
#     exmaple_after_lemmatization.append(text_lemmatization)
# print(exmaple_after_lemmatization)  

In [None]:
example_text_join=' '.join(exmaple_after_stemming)  # join tokens together 
example_text_join

### pre_processing function

In [None]:
def preprocessing(text): 
    """Clean one raw review and return it as a single normalized string.

    Pipeline (same steps demonstrated cell by cell above):
      1. replace every character that is not an ASCII letter or an
         apostrophe with a space,
      2. lowercase,
      3. tokenize with NLTK ``word_tokenize``,
      4. drop tokens found in the module-level ``final_stop_words``,
      5. stem each remaining token with ``PorterStemmer``,
      6. join the stems back together with single spaces.

    Args:
        text (str): Raw review text.

    Returns:
        str: Space-separated stemmed tokens; an empty string if no token
        survives the filtering.
    """
    # remove punctuation (anything except letters and apostrophes)
    text = re.sub('[^a-zA-Z\']', " ", text)
    # lower capitalization
    text = text.lower()
    # word tokenize
    tokens = word_tokenize(text)
    # remove stopwords: build a set once per call so each membership test is
    # O(1) instead of scanning the whole stop-word list for every token
    stop_words = frozenset(final_stop_words)
    tokens = [token for token in tokens if token not in stop_words]
    # stemming
    # NOTE: an unused WordNetLemmatizer() was instantiated here before; its
    # import is commented out at the top of the notebook, so every call raised
    # NameError. It has been removed because only the stemmer is applied.
    ps = PorterStemmer()
    tokens = [ps.stem(token) for token in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

In [None]:
train_df['processed_data'] = train_df['text'].apply(preprocessing)
train_df

In [None]:
test_df['processed_data'] = test_df['text'].apply(preprocessing)
test_df

### Vectorization

In [None]:
# Import CountVectorizer and TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
#cv = CountVectorizer()
cv = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the training text only and turn it into the feature
# matrix; the labels are taken straight from the 'label' column.
x_train = cv.fit_transform(train_df['processed_data'])
y_train = train_df['label']
print(x_train.shape)
print(x_train)


In [None]:
# Reuse the vectorizer fitted on the training data (transform only, no refit)
# so the test features use the same columns as x_train.
x_test = cv.transform(test_df['processed_data'])
y_test = test_df['label']
print(x_test.shape)
print(x_test)


# Machine learning models

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score , classification_report

### 1_Random forest algorithm

In [None]:
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest Classifier

In [None]:
# Random forest classifier instance. The fixed random_state pins the bootstrap
# samples and per-split feature subsets, so the reported metrics are the same
# on every "Restart & Run All" instead of changing from run to run.
rf_model = RandomForestClassifier(random_state=42)

In [None]:
rf_model.fit(x_train,y_train)  # train model

In [None]:
y_predict=rf_model.predict(x_test) # test model
y_predict

In [None]:
print (confusion_matrix(y_test, y_predict))

In [None]:
print (accuracy_score(y_test, y_predict))

In [None]:
print (classification_report(y_test, y_predict))

### 2_Naive Bayes algorithm

In [None]:
from sklearn.naive_bayes import MultinomialNB  # Import naive_bayes Classifier

In [None]:
multi_model = MultinomialNB()   # object from MultinomialNB to use it

In [None]:
multi_model.fit(x_train, y_train)  # train model

In [None]:
y_predict = multi_model.predict(x_test) # test model
y_predict

In [None]:
print (confusion_matrix(y_test, y_predict))   

In [None]:
print (accuracy_score(y_test, y_predict))

In [None]:
print (classification_report(y_test, y_predict))

### 3_knn algorithm (nearest-centroid classifier)

In [None]:
from sklearn.neighbors import NearestCentroid # Import the nearest-centroid classifier (note: this is not KNeighborsClassifier / k-NN)


In [None]:
knn_model=NearestCentroid() # object from NearestCentroid to use it

In [None]:
knn_model.fit(x_train,y_train) # train model

In [None]:
y_predict=knn_model.predict(x_test)  # test model
y_predict

In [None]:
print (confusion_matrix(y_test, y_predict))

In [None]:
print (accuracy_score(y_test, y_predict))

In [None]:
print (classification_report(y_test, y_predict))

### 4_Decision tree algorithm

In [None]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier


In [None]:
# Decision tree classifier instance. random_state is fixed because the tree
# builder randomizes the order in which features are considered at each split,
# which otherwise makes the fitted tree (and the metrics) vary between runs.
dt_model = DecisionTreeClassifier(random_state=42)

In [None]:
dt_model.fit(x_train,y_train)  # train model

In [None]:
y_predict=dt_model.predict(x_test)  # test model
y_predict

In [None]:
print (confusion_matrix(y_test, y_predict))

In [None]:
print (accuracy_score(y_test, y_predict))

In [None]:
print (classification_report(y_test, y_predict))