In [1]:
import numpy as np
import pandas as pd


In [2]:
# Load the raw movie-review sentences from CSV into a DataFrame.
df = pd.read_csv("data/movie_review.csv")

In [3]:
# Preview the first five rows to see which columns the file provides.
df.head(5)

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos


# Cleaning data

In [4]:
# Drop the identifier columns that are not used downstream.
# drop(..., errors="ignore") keeps this cell re-runnable: a repeated
# `del df[col]` raises KeyError once the column is already gone.
df = df.drop(columns=["fold_id", "html_id", "sent_id"], errors="ignore")

In [5]:
# Remove the remaining bookkeeping column; errors="ignore" makes the cell
# safe to execute more than once (a second `del` would raise KeyError).
df = df.drop(columns=["cv_tag"], errors="ignore")
df.head(5)

Unnamed: 0,text,tag
0,films adapted from comic books have had plenty...,pos
1,"for starters , it was created by alan moore ( ...",pos
2,to say moore and campbell thoroughly researche...,pos
3,"the book ( or "" graphic novel , "" if you will ...",pos
4,"in other words , don't dismiss this film becau...",pos


In [6]:
# Summary of the remaining columns (count / unique / top / freq).
df.describe()

Unnamed: 0,text,tag
count,64720,64720
unique,63652,2
top,.,pos
freq,123,32937


# Stemming

In [7]:
import nltk

In [8]:
# Work on a 500-row random subset of the data.
# A fixed random_state makes the sample (and every downstream output)
# reproducible under Restart Kernel -> Run All.
df1 = df.sample(n=500, random_state=42)

In [9]:
# Summary of the sampled subset before any text processing.
df1.describe()

Unnamed: 0,text,tag
count,500,500
unique,500,2
top,dark city is what philosophers would call an e...,neg
freq,1,263


In [10]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.lancaster import LancasterStemmer

def stem_text(text):
    """Return `text` with every token replaced by its Lancaster stem.

    The text is split with NLTK's `word_tokenize`, each token is run through
    a `LancasterStemmer`, and the stems are joined back with single spaces.
    """
    stemmer = LancasterStemmer()
    stems = [stemmer.stem(token) for token in word_tokenize(text)]
    return " ".join(stems)

In [11]:
# Replace each sentence in the sample with its stemmed version.
df1['text'] = df1['text'].map(stem_text)

In [12]:
# Summary of the sampled subset after stemming.
df1.describe()

Unnamed: 0,text,tag
count,500,500
unique,500,2
top,simon is a man who is a success in his circ of...,neg
freq,1,263


# Removing stopwords

In [13]:
from nltk.corpus import stopwords

def remove_stopwords(text):
    """Return `text` with NLTK's English stopwords removed.

    The text is tokenized with `word_tokenize`; tokens that appear in
    `stopwords.words('english')` are dropped and the rest are joined with
    single spaces. Matching is an exact, case-sensitive string comparison.

    NOTE(review): in this notebook the function is applied to already-stemmed
    text, so stopwords whose stem differs from the listed form are not
    removed -- confirm whether stopword removal should run before stemming.
    """
    # Load the stopword list once per call and use a set for O(1) lookups;
    # previously the list was re-fetched and scanned for every single token.
    stop_set = set(stopwords.words('english'))
    return " ".join(token for token in word_tokenize(text) if token not in stop_set)

In [14]:
# Strip English stopwords from every (already stemmed) sentence.
df1['text'] = df1['text'].map(remove_stopwords)

In [15]:
# Inspect a few processed rows; the index shows the original row positions
# of the sampled sentences.
df1.head(5)

Unnamed: 0,text,tag
17905,imagin embarrass famy next day find mul stol f...,pos
43880,win everyth .,neg
59744,dumb dumb .,neg
52408,"ev without quaid , doe n't appear switchback w...",neg
62908,"charact also chang whim fit mech script , ther...",neg


In [16]:
# Persist the stemmed, stopword-filtered sample to CSV.
df1.to_csv("data/processed_data1.csv")

In [17]:
# A truncated `df1 = df.sam` statement used to live here; it raised
# AttributeError and stopped Run All. The processed `df1` built in the
# cells above is carried forward unchanged into the vectorization steps.
df1.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
def get_vectorizer(data):
    """Fit a default `CountVectorizer` on `data` and return the fitted vectorizer."""
    # fit() returns the estimator itself, so construction and fitting chain.
    return CountVectorizer().fit(data)

In [None]:
# Fit a CountVectorizer on the processed text column.
vec = get_vectorizer(df1["text"])

In [None]:
# Preview a handful of (term -> column index) entries rather than dumping
# the entire vocabulary mapping into the cell output.
dict(list(vec.vocabulary_.items())[:10])

In [None]:
# Iterating a dict yields its keys, i.e. the distinct vocabulary terms.
unique_words = list(vec.vocabulary_)

In [None]:
# Keep the sentiment tags aside as the target labels.
labels = df1["tag"]

In [None]:
# Remove the label column from the feature frame. errors="ignore" keeps
# the cell re-runnable (a second `del df1["tag"]` would raise KeyError).
df1 = df1.drop(columns=["tag"], errors="ignore")

# Calculating TF-IDF matrix

In [None]:
import nltk
import string
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer



def tokenize(text):
    """Tokenize `text` with NLTK and return the Porter stems as a list.

    Passed to `TfidfVectorizer` as its `tokenizer` callback.
    """
    # Build the stemmer once per call; the previous version constructed a
    # new PorterStemmer for every token.
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in nltk.word_tokenize(text)]


# Fit TF-IDF on the processed sample, using `tokenize` (NLTK tokens ->
# Porter stems) as the tokenizer.
# NOTE(review): stop_words='english' is compared against the stemmed tokens
# produced by `tokenize`; confirm this combination filters what is intended.
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(df1["text"])

# Named `sample_sentence` instead of `str` so the built-in `str` type is not
# shadowed for the remainder of the kernel session.
sample_sentence = 'all great and precious things are lonely.'
response = tfidf.transform([sample_sentence])
print(response)

# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when it exists and fall back to the old name on older installations.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
#for col in response.nonzero()[1]:
    #print (feature_names[col], ' - ', response[0, col])

In [None]:
# Densify the sparse TF-IDF matrix. toarray() yields a regular ndarray
# (todense() returns the legacy np.matrix type). The guard makes the cell
# safe to re-run: an ndarray has no `toarray`, so a second run is a no-op.
if hasattr(tfs, "toarray"):
    tfs = tfs.toarray()

In [None]:
# Dimensions of the dense TF-IDF matrix.
tfs.shape

In [None]:
# One-hot encode the tags and keep only the indicator column for "pos".
labels = pd.get_dummies(labels).loc[:, "pos"]
labels

In [None]:
# Convert to a plain ndarray so the later `df_n['label'] = labels` assignment
# is positional instead of aligned on the sampled row index.
# np.asarray is idempotent, so re-running the cell does not fail the way
# `.values` does once `labels` is already an array.
labels = np.asarray(labels)

In [None]:
# Number of label entries.
labels.shape

In [None]:
# Wrap the dense TF-IDF matrix in a DataFrame.
df_n = pd.DataFrame(tfs)

In [None]:
# Append the target as an extra column.
# NOTE(review): assumes `labels` is an array with one entry per row of df_n,
# in the same order as the rows fed to the vectorizer -- confirm.
df_n['label'] = labels

In [None]:
# Dimensions of the feature frame including the added 'label' column.
df_n.shape

# Saving results

In [None]:
import pickle

# Serialize the fitted vectorizer. The context manager closes the file
# handle deterministically (the bare open(...) call left it to the GC).
# NOTE(review): `tfidf` was built with the notebook-level `tokenize` function;
# pickle stores functions by reference, so `tokenize` must be defined in the
# session that loads this file.
with open("tfidf.pickle", "wb") as f:
    pickle.dump(tfidf, f)

In [None]:
# Show the repr of the fitted vectorizer that was just pickled.
tfidf

In [None]:
# Round-trip check: load the pickled vectorizer back into `tf2`.
# Only unpickle files you created yourself; pickle.load can execute
# arbitrary code from an untrusted file.
with open('tfidf.pickle', 'rb') as f:
    tf2 = pickle.load(f)

In [None]:
# Show the repr of the reloaded vectorizer for comparison with `tfidf`.
tf2

In [None]:
# Re-check the TF-IDF matrix dimensions before exporting.
tfs.shape

In [None]:
# Write the TF-IDF features plus the 'label' column to CSV.
df_n.to_csv("data/matrixied.csv")