<a href="https://colab.research.google.com/github/Gamgom72/Natural-Language-Processing-Project/blob/main/NLP_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
# Download the NLTK data packages this notebook relies on: the 'punkt'
# tokenizer data and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Mount Google Drive at /content/MyDrive/ (force_remount=True is passed so the
# mount is redone on re-run). The dataset directories used by read_data() live
# below this mount point.
drive.mount('/content/MyDrive/',force_remount = True)

Mounted at /content/MyDrive/


In [None]:
def read_text_file(file_path):
    """Return the full contents of the text file at ``file_path`` as one string.

    The file is opened read-only in text mode with the default encoding and
    is closed before the function returns.
    """
    with open(file_path, 'r') as handle:
        contents = handle.read()
    return contents

In [None]:
def read_files(path,classType):
    """Load every ``.txt`` file directly inside ``path`` as a labelled sample.

    Args:
        path: Directory to scan (sub-directories are not descended into).
        classType: Label stored alongside the text of every file found.

    Returns:
        A list of ``[text, classType]`` pairs, one per ``.txt`` file, ordered
        by file name.
    """
    samples = []
    # os.listdir() yields entries in arbitrary order; sorting makes the sample
    # order identical from run to run.
    for file_name in sorted(os.listdir(path)):
        if file_name.endswith(".txt"):
            text = read_text_file(os.path.join(path, file_name))
            samples.append([text, classType])
    return samples

In [None]:
def read_data(negative_data="/content/MyDrive/MyDrive/txt_sentoken/neg",
              positive_data="/content/MyDrive/MyDrive/txt_sentoken/pos"):
    """Load the negative and positive review directories.

    Args:
        negative_data: Directory holding the negative reviews (labelled 0).
        positive_data: Directory holding the positive reviews (labelled 1).
            Both default to the locations on the mounted Drive, so calling
            ``read_data()`` with no arguments behaves as before.

    Returns:
        A ``(positive_dataset, negative_dataset)`` tuple; each element is the
        list of ``[text, label]`` pairs produced by ``read_files``.
    """
    negative_dataset = read_files(negative_data, 0)
    positive_dataset = read_files(positive_data, 1)
    return positive_dataset, negative_dataset

In [None]:
def remove_stop_words(speech_words):
    """Drop English stop words from a list of tokens.

    Each token is case-folded before being looked up in NLTK's English
    stop-word list; tokens that are kept retain their original form and order.
    """
    english_stop_words = set(stopwords.words("english"))
    kept_tokens = []
    for token in speech_words:
        if token.casefold() in english_stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [None]:
def remove_numbers(speech):
    """Replace every run of consecutive digits in ``speech`` with one space.

    Args:
        speech: Text to clean.

    Returns:
        The text with each digit run replaced by a single space character.
    """
    # Raw string: "\d" inside a normal string literal is an invalid escape
    # sequence, which Python reports with a warning and plans to reject.
    return re.sub(r"\d+", ' ', speech)

In [None]:
def remove_punctuation(filtered_speech_words):
    """Strip punctuation characters from every token.

    Every character found in ``string.punctuation`` is deleted from each
    token; tokens that end up empty are left out of the returned list.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    stripped_tokens = (token.translate(strip_table) for token in filtered_speech_words)
    # Tokens that consisted only of punctuation are now empty strings; skip them.
    return [token for token in stripped_tokens if token]

In [None]:
def stemming(speech_words):
    """Return the Porter stem of every token, in the original token order."""
    stem = PorterStemmer().stem
    return list(map(stem, speech_words))

In [None]:
def preprocessing(dataset):
    """Normalise the text of every ``[text, label]`` sample.

    Each text goes through: lower-casing, tokenisation, stop-word removal,
    punctuation removal, Porter stemming, and is finally re-joined with single
    spaces.

    Args:
        dataset: Iterable of rows whose first element is the raw text; any
            remaining elements (the label) are carried over unchanged.

    Returns:
        A new list of rows with the processed text in position 0. The rows
        passed in are not modified.
    """
    data = []
    for speech in dataset:
        speech_words = word_tokenize(speech[0].lower())
        speech_words = remove_stop_words(speech_words)
        speech_words = remove_punctuation(speech_words)
        speech_words = stemming(speech_words)
        # Work on a copy of the row so the caller's raw data stays intact.
        processed_row = list(speech)
        processed_row[0] = ' '.join(speech_words)
        data.append(processed_row)
    return data


In [None]:
def loading_data():
    """Read, preprocess and combine the positive and negative reviews.

    Returns:
        A DataFrame with columns ``'Review'`` (processed text) and ``'Label'``
        (1 for positive, 0 for negative). Positive rows come first and the
        index is renumbered from 0.
    """
    columns = ['Review', 'Label']
    pos_data, neg_data = read_data()

    pos_frame = pd.DataFrame(preprocessing(pos_data), columns=columns)
    # Scalar assignment labels every row regardless of how many reviews were
    # found; a fixed-length list fails when the count differs from it.
    pos_frame['Label'] = 1

    neg_frame = pd.DataFrame(preprocessing(neg_data), columns=columns)
    neg_frame['Label'] = 0

    return pd.concat([pos_frame, neg_frame], ignore_index=True, sort=False)

In [None]:
# Build the combined, preprocessed review DataFrame ('Review', 'Label') and
# display it as the cell output.
dataset = loading_data()
dataset

Unnamed: 0,Review,Label
0,film adapt comic book plenti success whether r...,1
1,everi movi come along suspect studio everi ind...,1
2,ve got mail work alot better deserv order make...,1
3,jaw rare film grab attent show singl imag scre...,1
4,moviemak lot like gener manag nfl team postsal...,1
...,...,...
1995,season need anoth serial killer movi like kath...,0
1996,plot separ glamor hollywood coupl must pretend...,0
1997,talent actress bless demonstr wide act rang ot...,0
1998,susan granger s review ghost mar soni pictur e...,0


In [None]:
# Fit a TF-IDF vectorizer on the preprocessed reviews; x is the resulting
# document-term matrix.
cv=TfidfVectorizer()
x=cv.fit_transform(dataset['Review'])
# Convert to a dense DataFrame with one column per vocabulary term.
# NOTE(review): toarray() materialises the whole matrix densely, which may be
# memory-heavy for a vocabulary this size — confirm this is acceptable.
data = pd.DataFrame(x.toarray(),columns=cv.get_feature_names_out())
# Append the class label as the last column.
data['Label'] = dataset['Label']

In [None]:
# Number of columns in the feature table: one per vocabulary term plus the
# trailing 'Label' column. The table itself is displayed as the cell output.
col = data.shape[1]
print(col)
data

31265


Unnamed: 0,00,000,0009f,000acr,000aweek,000foot,007,007esqu,010,03,...,zuko,zukovski,zulu,zundel,zurg,zweibel,zwick,zwigoff,zyci,Label
0,0.066715,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.000000,0.023751,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1996,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1997,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1998,0.000000,0.055940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
# Separate the features from the target by column name, so the split keeps
# working if the vocabulary size (and therefore the column count) changes.
X = data.drop(columns='Label')
Y = data['Label']

# Let train_test_split do the shuffling: with shuffle=True the random_state is
# actually honoured, so the split is reproducible, and `data` is no longer
# overwritten, so re-running this cell gives the same result. The rows are
# ordered positive-then-negative, so shuffling is required; stratify keeps both
# classes equally represented in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y,
    test_size=0.25,
    random_state=104,
    shuffle=True,
    stratify=Y,
)


In [None]:
# Fit a logistic-regression classifier (constructed with no arguments, i.e.
# library defaults) on the training split.
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train)

In [None]:
# Evaluate the logistic-regression model on the held-out split and print the
# score as a percentage.
score = logisticRegr.score(X_test, y_test)
print("Accuracy Logistic Regression Model")
print(score*100)

Accuracy Logistic Regression Model
80.80000000000001


In [None]:
from sklearn.svm import SVC
# Fit a support-vector classifier with a linear kernel on the training split.
# NOTE(review): gamma is presumably unused by the linear kernel — confirm
# against the SVC documentation before relying on it.
SVCClf = SVC(kernel = 'linear',gamma = 'scale', shrinking = False,)
SVCClf.fit(X_train, y_train)

In [None]:
# Evaluate the classifier currently bound to SVCClf on the held-out split and
# print the score as a percentage.
score = SVCClf.score(X_test, y_test)
print("Accuracy SVM Model")
print(score*100)

Accuracy SVM Model
83.2


In [None]:
# Fit a support-vector classifier with a polynomial kernel on the training
# split. This rebinds SVCClf, so the linear-kernel model fitted earlier is no
# longer reachable through that name.
SVCClf = SVC(kernel = 'poly',gamma = 'scale', shrinking = False,)
SVCClf.fit(X_train, y_train)

In [None]:
# Evaluate the classifier currently bound to SVCClf (the polynomial-kernel
# model at this point) on the held-out split and print the score as a percentage.
score = SVCClf.score(X_test, y_test)
print("Accuracy SVM Model")
print(score*100)

Accuracy SVM Model
73.2


In [None]:


# `metrics` and `plt` are already imported in the notebook's first cell.

# Predict the test-set labels with the most recently fitted classifier (the
# polynomial-kernel SVM bound to SVCClf); `pred` was previously never defined.
pred = SVCClf.predict(X_test)

# confusion_matrix compares true labels with predicted labels, so the first
# argument must be y_test rather than the feature matrix X_test.
confusion_matrix = metrics.confusion_matrix(y_test, pred)

# Labels are 0 = negative review, 1 = positive review.
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ["Negative", "Positive"])
cm_display.plot()
plt.show()




NameError: ignored