##### CCS/00081/019 SHALLON SAID
##### CCS/00009/019 ERICK MURIITHI
##### CCS/00190/019 TECLA BIWOTT
### Text Preprocessing


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\murii\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Text preprocessing steps achieved: lowercasing, removal of special characters (digits are kept), tokenization, stop-word removal, and stemming

In [5]:

# Load the CSV file into a pandas DataFrame
df = pd.read_csv('movie_reviews.csv')

# Rows with no review text are loaded as NaN (a float), which has no .lower();
# they carry no text to classify, so drop them before cleaning.
df = df.dropna(subset=['review']).reset_index(drop=True)

# nltk.word_tokenize needs the Punkt tokenizer models, which are not fetched by
# the stopwords download in the imports cell. Probe once and download on demand
# so the notebook also runs on a machine that does not have them yet.
try:
    nltk.word_tokenize('probe')
except LookupError:
    nltk.download('punkt')
    nltk.download('punkt_tab')  # resource name used by newer NLTK releases

# Built once and shared by every call to preprocess_review
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()


def preprocess_review(text):
    """Turn one raw review into a cleaned, space-separated string of stems.

    Steps, in order: lowercase, strip every character that is not a letter,
    digit or whitespace, tokenize with NLTK, drop English stop words, and
    Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; empty if no token survives.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
    tokens = nltk.word_tokenize(cleaned)
    return ' '.join(ps.stem(token) for token in tokens if token not in stop_words)


# Clean every review in a single pass instead of rewriting the column five times
df['review'] = df['review'].apply(preprocess_review)

# Save the preprocessed data to a new CSV file (the raw file is left untouched)
df.to_csv('preprocessed_movie_reviews.csv', index=False)

##### Vectorize your texts with one of the document representation techniques discussed in class

In [6]:
# Load the preprocessed data into a pandas DataFrame
df = pd.read_csv('preprocessed_movie_reviews.csv')

# A review that preprocessing reduced to an empty string is read back from the
# CSV as NaN, which CountVectorizer rejects. Treat it as an empty document.
df['review'] = df['review'].fillna('')

# Create a CountVectorizer object (bag-of-words with raw term counts)
vectorizer = CountVectorizer()

# Learn the vocabulary and build the bag-of-words matrix in one pass.
# NOTE(review): the vocabulary is learned from every review, including the ones
# that are later held out for testing.
bow_matrix = vectorizer.fit_transform(df['review'])


##### Train and test a classification model using the corpus you have created: Implemented a Decision Tree

In [7]:

# Reload the preprocessed reviews to get the sentiment labels
# (bow_matrix itself comes from the vectorization cell, not from this file)
df = pd.read_csv('preprocessed_movie_reviews.csv')

# The labels must line up row-for-row with the bag-of-words matrix
assert bow_matrix.shape[0] == len(df), "bow_matrix and df have different numbers of rows"

# Fixed seed so the split and the fitted tree are identical on every run
RANDOM_STATE = 42

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    bow_matrix, df['sentiment'], test_size=0.2, random_state=RANDOM_STATE
)

# Create a decision tree classifier object
clf = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Train the classifier on the training data, then predict the held-out reviews
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

##### Use precision, recall and f-score to measure your model performance 


In [9]:
# Accuracy of the classifier on the held-out test split
accuracy = clf.score(X_test, y_test)
print(f"Accuracy: {accuracy}")

# Support-weighted precision, recall and F1 score.
# zero_division=0 is used for all three: an undefined score (e.g. a class that
# is never predicted) counts as 0 rather than as a perfect 1, and the three
# metrics are computed under the same convention.
metric_options = {'average': 'weighted', 'zero_division': 0}
precision = precision_score(y_test, y_pred, **metric_options)
recall = recall_score(y_test, y_pred, **metric_options)
f1 = f1_score(y_test, y_pred, **metric_options)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 score: 1.0
