<a href="https://colab.research.google.com/github/Manar-Emad75/NLP_Project/blob/main/NLP_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [100]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import nltk
import os
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from nltk.stem import WordNetLemmatizer

In [101]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [102]:
zip_file_path = '/content/pos.zip'
extracted_folder_path = '/content/extracted_folder'
os.makedirs(extracted_folder_path, exist_ok=True)
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder_path)


zip_file_path = '/content/neg.zip'
extracted_folder_path = '/content/extracted_folder'
os.makedirs(extracted_folder_path, exist_ok=True)
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder_path)

folderNeg_path = '/content/extracted_folder/neg'
folderPos_path = '/content/extracted_folder/pos'

#Read text files and put them in list

In [103]:
def read_text_files(folder_path):
    texts = []
    files = os.listdir(folder_path)
    for file_name in files:
        if file_name.endswith('.txt'):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'r') as f:
                text = f.read()
                texts.append(text)
    return texts

In [104]:
texts_folderNeg = read_text_files(folderNeg_path)
texts_folderPos = read_text_files(folderPos_path)

# access spacifice file in list
neg_file = texts_folderNeg[20]

In [105]:
def preprocess_text(text):

    # Lowercasing
    text = text.lower()

    # Removing punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # tokenization and Removing stopwords
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_text]

    # Joining the lemmatized text back into a single string
    processed_text = ' '.join(lemmatized_text)

    return processed_text

#Load data as train(X) and test(Y)

In [106]:
def load_data(folder):
    reviews = []
    labels = []
    for file in os.listdir(folder):
        with open(os.path.join(folder, file), 'r', encoding='utf-8') as f:
            review = f.read()
            processed_review = preprocess_text(review)
            reviews.append(processed_review)
            labels.append(folder.split('/')[-1])  # Extract label from folder name
    return reviews, labels

In [107]:
positive_reviews, positive_labels = load_data(folderPos_path)
negative_reviews, negative_labels = load_data(folderNeg_path)

#Combine all positive and negative file to train model

In [108]:
# Combine positive and negative reviews and labels
all_reviews = positive_reviews + negative_reviews
all_labels = positive_labels + negative_labels

#Apply TF-IDF

In [109]:
# Feature Extraction TF-IDF
vectorizer = TfidfVectorizer()

#define train and test
X = vectorizer.fit_transform(all_reviews)
y = all_labels

# Data Spliting

In [110]:
# Model Training and Testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply Classifier

In [111]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Predict the labels of the test data
y_pred = classifier.predict(X_test)

# Visualizing Results
print("\nAccuracy: {:.2f} %".format(accuracy_score(y_test, y_pred) * 100))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 84.00 %

Classification Report:
               precision    recall  f1-score   support

         neg       0.86      0.82      0.84       201
         pos       0.82      0.86      0.84       199

    accuracy                           0.84       400
   macro avg       0.84      0.84      0.84       400
weighted avg       0.84      0.84      0.84       400



In [112]:
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

# Predict the labels of the test data
y_pred = svm_classifier.predict(X_test)

# Visualizing Results
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 85.00%
Classification Report:
               precision    recall  f1-score   support

         neg       0.85      0.85      0.85       201
         pos       0.85      0.85      0.85       199

    accuracy                           0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400



#func to colored output

In [116]:
from IPython.display import display, HTML

# Define HTML for displaying text in specified color and style
def colored_text(text, color='black', style='normal'):
    color_code = {'black': 'black', 'red': 'red', 'green': 'green'}
    style_code = {'normal': 'normal', 'bold': 'bold'}

    return f"<span style='color:{color_code[color]}; font-weight:{style_code[style]}'>{text}</span>"

#Define if file is positive or negative

In [117]:
# aplay pre-processing to file
processed_review = preprocess_text(neg_file)

# Transform the preprocessed review into a TF-IDF vector
review_vector = vectorizer.transform([processed_review])

# Classify the sentiment using the trained classifier in LogisticRegression
sentiment = classifier.predict(review_vector)[0]

if sentiment == 'neg':
    display(HTML(colored_text('Negative Review', color='red', style='bold')))
else:
    display(HTML(colored_text('Positive Review', color='green', style='bold')))

#Saving the model

In [115]:
import joblib
import matplotlib.pyplot as plt
model_filename = 'sentiment_classifier_model.pkl'
joblib.dump(classifier, model_filename)

# Check if the model file exists
if os.path.exists(model_filename):
    print("Model has been saved successfully.")
else:
    print("Model could not be saved. Please check the file path.")

Model has been saved successfully.
