<a href="https://colab.research.google.com/github/JehadOumer/IMDB-Reviews-Classification/blob/main/Classifiers/SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import json

In [3]:
training = pd.read_csv('/content/drive/MyDrive/IMDB Review Dataset Processed/processed_training_data.csv')
validation = pd.read_csv('/content/drive/MyDrive/IMDB Review Dataset Processed/processed_validation_data.csv')
testing = pd.read_csv('/content/drive/MyDrive/IMDB Review Dataset Processed/processed_testing_data.csv')

In [None]:
training.head(5)

Unnamed: 0.1,Unnamed: 0,label,text
0,0,0,this be an absolutely terrible movie do not be...
1,1,0,pron have be know to fall asleep during film b...
2,2,0,person photograph org in a superb fashion and ...
3,3,1,this be the kind of film for a snowy date time...
4,4,1,as other have mention all the woman that go nu...


In [4]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2))
count_vectorizer.fit(training['text'])
vectorized = count_vectorizer.transform(training['text'])
print(dict(list(count_vectorizer.vocabulary_.items())[0:8]))
print('The transformed data matrix dimensions:', vectorized.shape)

{'absolutely': 8291, 'terrible': 1137108, 'movie': 719798, 'lure': 662867, 'person': 813247, 'great': 482769, 'actor': 18086, 'simply': 1034229}
The transformed data matrix dimensions: (30000, 1279860)


In [5]:
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(vectorized)
transformed_tfidf = tfidf_transformer.transform(vectorized)
print(transformed_tfidf.shape)
print(transformed_tfidf[2])


(30000, 1279860)
  (0, 1276252)	0.12744135201161827
  (0, 1275965)	0.04746691098944858
  (0, 1251329)	0.10408102932206548
  (0, 1251092)	0.06380597224781188
  (0, 1246086)	0.12744135201161827
  (0, 1246071)	0.07117770774248602
  (0, 1220044)	0.1225738118688427
  (0, 1219426)	0.04422434335648769
  (0, 1185186)	0.12744135201161827
  (0, 1184992)	0.04817839949422897
  (0, 1164539)	0.12744135201161827
  (0, 1164025)	0.051529252645216794
  (0, 1134139)	0.08928935252811744
  (0, 1133577)	0.03647313765762613
  (0, 1110255)	0.12744135201161827
  (0, 1110138)	0.05681306038752741
  (0, 1092384)	0.12744135201161827
  (0, 1092302)	0.055849589195996883
  (0, 1018746)	0.07809100556846
  (0, 1018663)	0.06877875266100374
  (0, 978038)	0.12744135201161827
  (0, 977927)	0.06769650396744183
  (0, 951777)	0.11912023717643695
  (0, 951702)	0.06693467313665075
  (0, 946927)	0.10325278155225119
  :	:
  (0, 352265)	0.10710664020607569
  (0, 350132)	0.11912023717643695
  (0, 350126)	0.09533864911921466
  (0, 3

In [6]:
pipeline = Pipeline([
        ('count_vectorizer', CountVectorizer(stop_words='english', ngram_range=(1,2))),  
        ('tfidf_transformer', TfidfTransformer()),  
        ('classifier', LinearSVC())
    ])

In [7]:
pipeline.fit(training['text'], training['label'])
prediction = pipeline.predict(validation['text'])


In [8]:
print(classification_report(validation['label'], prediction))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      5025
           1       0.88      0.90      0.89      4975

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [9]:
def get_metrics(predicted,true):
    metrics = dict()
    metrics['accuracy'] = round(accuracy_score(predicted, true), 5)
    metrics['precision'] = round(precision_score(predicted, true, average = 'weighted'), 5)
    metrics['recall'] = round(recall_score(predicted, true, average = 'weighted'), 5)
    metrics['f1'] = round(f1_score(predicted, true, average = 'weighted'), 5)
    
    return metrics

In [10]:
metrics = get_metrics(prediction, validation['label'])
print(metrics)

{'accuracy': 0.8874, 'precision': 0.88761, 'recall': 0.8874, 'f1': 0.88741}


In [11]:
with open("SVM_results.json", "w") as output:
    json.dump(metrics, output)