In [1]:
import os
import numpy as np
import pandas as pd
import pyprind

In [2]:
basepath = '/Users/jamilsharif/Desktop/aclImdb'
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)



In [3]:
# Initialize a DataFrame to store the data
df = pd.DataFrame()

In [4]:
dfs = []  # Initialize an empty list to store DataFrames
# Loop over the 'test' and 'train' directories
for s in ('test', 'train'):
    # Loop over the 'pos' and 'neg' subdirectories
    for l in ('pos', 'neg'):
        # Construct the path to the directory containing the files
        path = os.path.join(basepath, s, l)
        try:
            # Get the list of files in the directory
            files = sorted(os.listdir(path))
        except FileNotFoundError:
            # Handle the case where the directory is not found
            print(f"Directory '{path}' not found.")
            continue
        # Initialize a list to store data from current directory
        data = []
        # Loop over the files in the directory
        for file in files:
            # Read the contents of the file
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            # Append the text and label to the list
            data.append([txt, labels[l]])
            # Update the progress bar
            pbar.update()
        # Convert the list to a DataFrame and append to the list of DataFrames
        dfs.append(pd.DataFrame(data, columns=['review', 'sentiment']))

# Concatenate all DataFrames in the list
df = pd.concat(dfs, ignore_index=True)

In [5]:
# Shuffle the DataFrame
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))

df.to_csv('/Users/jamilsharif/SentimentClassification/movie_data.csv', index=False, encoding='utf-8')

In [6]:
df = pd.read_csv('/Users/jamilsharif/SentimentClassification/movie_data.csv', encoding='utf-8')

# Display the first few rows of the DataFrame
print(df.head())

                                              review  sentiment
0  Calling this a romantic comedy is accurate but...          1
1  I'm not aware of "Largo Winch" as a comic book...          1
2  The stranger Jack (Matthew Lillard) arrives in...          0
3  This film fails on many many levels. The scrip...          0
4  Gregory Peck gives a brilliant performance in ...          1


In [7]:
df.shape

(47864, 2)

In [8]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array(['The sun is shining',
                 'The weather is sweet',
                 'The sun is shining, the weather is sweet, and one and one is two'])
bag = count.fit_transform(docs)

In [9]:
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [10]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [11]:
df.loc[0, 'review'][-50:]

'genre and borrows a plot element, but that is all.'

In [12]:
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.stem.porter import PorterStemmer

In [13]:
# Define preprocessor function to clean the text data
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [14]:
# Test the preprocessor function
print(preprocessor(df.loc[0, 'review'][-50:]))
print(preprocessor("</a>This :) is :( a test :-)!"))

genre and borrows a plot element but that is all 
this is a test :) :( :)


In [15]:
# Apply preprocessor function to all reviews in the DataFrame
df['review'] = df['review'].apply(preprocessor)

In [16]:
# Define tokenizer function to split documents into individual words
def tokenizer(text):
    return text.split()

In [17]:
# Test the tokenizer function
print(tokenizer('runners like running and thus they run'))

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']


In [18]:
# Define tokenizer function with Porter stemming
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [21]:
import nltk

# Download English stop-words
# nltk.download('stopwords') # comment this out to download and then comment back

from nltk.corpus import stopwords

# Load English stop-words
stop = stopwords.words('english')

# Example text
text = "a runner likes running and runs a lot"

# Tokenize text and remove stop-words
filtered_words = [w for w in tokenizer_porter(text) if w not in stop]

print(filtered_words)

['runner', 'like', 'run', 'run', 'lot']


In [22]:
X_train = df.loc[:5000, 'review'].values
y_train = df.loc[:5000, 'sentiment'].values

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__stop_words': [None],
    'vect__tokenizer': [None],  # Use default tokenizer
    'clf__penalty': ['l1', 'l2'],
    'clf__C': [1.0, 10.0],
}

# Initialize TfidfVectorizer
tfidf = TfidfVectorizer()

# Initialize Logistic Regression classifier
clf = LogisticRegression(random_state=0, solver='liblinear')

# Create a Pipeline
lr_tfidf = Pipeline([
    ('vect', tfidf),
    ('clf', clf)
])

# Initialize GridSearchCV with multiprocessing
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5, verbose=2,
                           n_jobs=-1)  # Utilize all available CPU cores

# Fit the model
gs_lr_tfidf.fit(X_train, y_train)

# Print the best parameter set
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END clf__C=1.0, clf__penalty=l1, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=None; total time=   3.3s
[CV] END clf__C=1.0, clf__penalty=l1, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=None; total time=   3.4s
[CV] END clf__C=1.0, clf__penalty=l1, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=None; total time=   3.4s
[CV] END clf__C=1.0, clf__penalty=l1, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=None; total time=   3.4s
[CV] END clf__C=1.0, clf__penalty=l1, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=None; total time=   2.8s
[CV] END clf__C=1.0, clf__penalty=l1, vect__ngram_range=(1, 2), vect__stop_words=None, vect__tokenizer=None; total time=  11.6s
[CV] END clf__C=1.0, clf__penalty=l1, vect__ngram_range=(1, 2), vect__stop_words=None, vect__tokenizer=None; total time=  11.7s
[CV] END clf__C=1.0, clf__penalty=l1, vect__

In [24]:
# Print the average 5-fold cross-validation accuracy score on the training dataset
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

# Get the best estimator from the grid search
clf = gs_lr_tfidf.best_estimator_

# Splitting the data into training and testing sets
X_test = df.loc[5000:, 'review'].values
y_test = df.loc[5000:, 'sentiment'].values

# Print the classification accuracy on the test dataset
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

CV Accuracy: 0.868
Test Accuracy: 0.872


In [25]:
import numpy as np
import re
from nltk.corpus import stopwords

stop = stopwords.words('english')

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [26]:
next(stream_docs(path='/Users/jamilsharif/SentimentClassification/movie_data.csv'))

('"Calling this a romantic comedy is accurate but nowadays misleading. The genre has sadly deteriorated into cliches, too focused on making the main couple get together and with very little room for ambience and other stories, making it formulaic and overly predictable.<br /><br />The Shop Around the Corner does not suffer from these illnesses: it manages to create a recognisably middle/eastern-European atmosphere and has a strong cast besides the (also strong) nominal leads; I avoid using the words \'supporting cast\' as for example Mr. Matuschek (Frank Morgan) has a central role to the film and his story is equally if not more important than the romance.<br /><br />The 1998 film You\'ve Got Mail borrowed the \'anonymous pen-pal\' idea from this film and has therefore been billed as a remake. This is not correct and in fact unfair to the new movie - it shares the genre and borrows a plot element, but that is all."',
 1)

In [27]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [28]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

In [29]:
clf = SGDClassifier(loss='log_loss', random_state=1)

doc_stream = stream_docs(path='/Users/jamilsharif/SentimentClassification/movie_data.csv')

In [30]:
import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])



In [31]:
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        doc_stream = stream_docs(path='/Users/jamilsharif/SentimentClassification/movie_data.csv')  # Reset doc_stream
        continue
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

In [32]:
# Now, get the test data
X_test, y_test = get_minibatch(doc_stream, size=5000)
if X_test is not None:  # Check if test data exists
    X_test = vect.transform(X_test)
    print('Accuracy: %.3f' % clf.score(X_test, y_test))

    # Update the model with the last 5,000 documents
    clf = clf.partial_fit(X_test, y_test)
else:
    print("No test data available.")

No test data available.


In [33]:
import pandas as pd

df = pd.read_csv('/Users/jamilsharif/SentimentClassification/movie_data.csv', encoding='utf-8')

In [34]:
import numpy as np
from sklearn.decomposition import NMF

# Assuming you have a sample input matrix X
X = np.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9]])

# Define the number of components for NMF
n_components = 2  # Adjust as needed

# Initialize and fit NMF model
nmf = NMF(n_components=n_components, random_state=123, max_iter=500)
W = nmf.fit_transform(X)  # W represents document-topic matrix
H = nmf.components_  # H represents topic-word matrix

In [35]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

# Assuming you have your original feature matrix X

# Perform PCA for dimensionality reduction
pca = PCA(n_components=3)  # Adjust the number of components based on the number of features
X_reduced = pca.fit_transform(X)

# Apply min-max scaling to ensure non-negative values
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_reduced)

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF

In [37]:
count = CountVectorizer(stop_words='english', max_df=0.1, max_features=5000)
X = count.fit_transform(df['review'].values)

In [38]:
n_components = 5  # Adjust as needed
nmf = NMF(n_components=n_components, random_state=123, max_iter=500)
X_topics_nmf = nmf.fit_transform(X)  # Assuming X is your input data

In [51]:
print("NMF Components Shape:", nmf.components_.shape)

NMF Components Shape: (5, 5000)


In [52]:
# Assuming nmf is your trained NMF model
n_top_words = 5
feature_names = count.get_feature_names_out()
for topic_idx, topic in enumerate(nmf.components_):
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

Topic 1:
guy minutes action worst ll
Topic 2:
horror house gore blood effects
Topic 3:
series episode tv original episodes
Topic 4:
war american soldiers men german
Topic 5:
role family performance woman father


In [53]:
# Assuming X_topics_nmf is your topic distribution matrix obtained from NMF
horror = X_topics_nmf[:, 4].argsort()[::-1]  # Accessing the 5th topic (index 4)
for iter_idx, movie_idx in enumerate(horror[:3]):
    print('\nHorror movie #%d:' % (iter_idx + 1))
    print(df['review'][movie_idx][:300], '...')


Horror movie #1:
A DOUBLE LIFE has developed a mystique among film fans for two reasons: the plot idea of an actor getting so wrapped up into a role (here Othello) as to pick up the great flaw of that character and put it into his life; and that this is the film that won Ronald Colman the Academy Award (as well as t ...

Horror movie #2:
If you cannot enjoy a chick flick, stop right now. If, however, you enjoy films that illustrate complex characters and provide extraordinary acting, read on.<br /><br />Ann Grant Lord is dying. Her two daughters arrive to be at her bedside. Ann begins talking about people from her past of whom the d ...

Horror movie #3:
*!!- SPOILERS - !!*<br /><br />Before I begin this, let me say that I have had both the advantages of seeing this movie on the big screen and of having seen the "Authorized Version" of this movie, remade by Stephen King, himself, in 1997.<br /><br />Both advantages made me appreciate this version of ...


In [62]:
import pickle
import os

dest = os.path.join('/Users/jamilsharif/SentimentClassification/movieclassifier/pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)

pickle.dump(stop,
            open(os.path.join(dest, 'stopwords.pkl'), 'wb'),
            protocol=4)

pickle.dump(clf,
            open(os.path.join(dest, 'classifier.pkl'), 'wb'),
            protocol=4)

In [63]:
from sklearn.feature_extraction.text import HashingVectorizer
import re
import os
import pickle

cur_dir = os.path.dirname('/Users/jamilsharif/SentimentClassification/movieclassifier/pkl_objects')
stop = pickle.load(open(os.path.join(
                   cur_dir, 'pkl_objects', 'stopwords.pkl'),
                   'rb'))

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) \
                    + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

In [65]:
import pickle
import re
import os
from vectorizer import vect

clf = pickle.load(open(os.path.join(
                   'pkl_objects', 'classifier.pkl'),
                   'rb'))

In [66]:
import numpy as np

label = {0:'negative', 1:'positive'}
example = ["I love this movie. It's amazing."]
X = vect.transform(example)

print('Prediction: %s\nProbability: %.2f%%' %\
      (label[clf.predict(X)[0]],
       np.max(clf.predict_proba(X))*100))

Prediction: positive
Probability: 95.55%


In [67]:
import sqlite3
import os

# Create a connection to the SQLite database file
conn = sqlite3.connect('reviews.sqlite')

# Create a cursor object to execute SQL commands
c = conn.cursor()

# Drop the table if it already exists
c.execute('DROP TABLE IF EXISTS review_db')

# Create a new table named review_db
c.execute('CREATE TABLE review_db'
          ' (review TEXT, sentiment INTEGER, date TEXT)')

# Insert example movie reviews into the database
example1 = 'I love this movie'
c.execute("INSERT INTO review_db"
          " (review, sentiment, date) VALUES"
          " (?, ?, DATETIME('now'))", (example1, 1))

example2 = 'I disliked this movie'
c.execute("INSERT INTO review_db"
          " (review, sentiment, date) VALUES"
          " (?, ?, DATETIME('now'))", (example2, 0))

# Commit the changes to the database
conn.commit()

# Close the connection
conn.close()

In [68]:
# Reopen the connection to the SQLite database
conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()

# Fetch all rows in the database table within a specific date range
c.execute("SELECT * FROM review_db WHERE date"
          " BETWEEN '2017-01-01 00:00:00' AND DATETIME('now')")
results = c.fetchall()

# Close the connection
conn.close()

# Print the fetched results
print(results)


[('I love this movie', 1, '2024-05-01 01:49:06'), ('I disliked this movie', 0, '2024-05-01 01:49:06')]


In [69]:
from flask import Flask, render_template, request
from wtforms import Form, TextAreaField, validators
import pickle
import sqlite3
import os
import numpy as np
from vectorizer import vect  # Import HashingVectorizer from local dir

In [70]:
app = Flask(__name__)

In [72]:
cur_dir = os.path.dirname('/Users/jamilsharif/SentimentClassification/movieclassifier/pkl_objects')
clf = pickle.load(open(os.path.join(cur_dir, 'pkl_objects', 'classifier.pkl'), 'rb'))
db = os.path.join(cur_dir, 'reviews.sqlite')

def classify(document):
    label = {0: 'negative', 1: 'positive'}
    X = vect.transform([document])
    y = clf.predict(X)[0]
    proba = np.max(clf.predict_proba(X))
    return label[y], proba

def train(document, y):
    X = vect.transform([document])
    clf.partial_fit(X, [y])

def sqlite_entry(path, document, y):
    conn = sqlite3.connect(path)
    c = conn.cursor()
    c.execute("INSERT INTO review_db (review, sentiment, date)"\
              " VALUES (?, ?, DATETIME('now'))", (document, y))
    conn.commit()
    conn.close()