In [1]:
import pandas as pd
import numpy as np
import math
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import KFold
import re
import nltk as nl
from nltk.corpus import stopwords
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

# 1. Reading Data 

In [2]:
data = pd.read_csv('shuffled_movie_data.csv')
data.shape

(50000, 2)

In [3]:
pronouns = ['i','me','my','mine','you','your','yours','we','us','our','ours','myself','yourself','no']
ex = '!'
stop = stopwords.words('english')
porter = PorterStemmer()
stopWords = [n for n in stop if n not in pronouns and n not in ex]
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    text = re.sub('\.', ' ', text)
    #text = re.sub(':', ' : ', text)
    emoticons = '(?::|;|=)(?:-)?(?:\)|\(|D|P)'
    #emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    #exclamation = re.findall('(!)',text.lower())
    text = re.sub('[\W]+' + emoticons + ']', ' ', text.lower()) 
    text = [w for w in text.split() if w not in stopWords]
    tokenized = [porter.stem(w) for w in text]
    return text

In [4]:
tokenizer('decked... :) bad')

['decked', ':)', 'bad']

# 2. Preparing Data

In [9]:
data.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


## 2.1. Dictionaries

positive-words: Is a file that contain positive words

negative-words: Is a file that contain negative words

positive-emoticons: Is a file that contain positive emoticons

negative-emoticons: Is a file that contain negative emoticons

pronouns-words: Is a file that contain first and second pronouns

In [10]:
filePositive = open('lexicon/positive-words.txt', encoding = "ISO-8859-1")
linesPositive = filePositive.read().split()
filePositive.close()

fileNegative = open('lexicon/negative-words.txt', encoding = "ISO-8859-1")
linesNegative = fileNegative.read().split()
fileNegative.close()

fileEmoPositive = open('lexicon/positive-emoticons.txt', encoding = "ISO-8859-1")
linesEmoPositive = fileEmoPositive.read().split()
fileEmoPositive.close()

fileEmoNegative = open('lexicon/negative-emoticons.txt', encoding = "ISO-8859-1")
linesEmoNegative = fileEmoNegative.read().split()
fileEmoNegative.close()

filePronouns = open('lexicon/pronouns-words.txt', encoding = "ISO-8859-1")
linesPronouns = filePronouns.read().split()
filePronouns.close()



## 2.2. Create new Matrix

### Colums

positiveWords: Number Positive words in a review 

negativeWords: Number of Negative words in a review 

positiveEmoticons: Number Positive Emoticons in a review 

negativeEmoticons: Number of Negative Emoticons in a review 

pronouns: Number of first and second person pronouns in a review

logTotalWords: log of total number of words in a review 

setiment: True value of sentiment of a review

In [None]:
reviewList = []
wordList = []
matrix = pd.DataFrame(columns=['positiveWords','negativeWords','positiveEmoticons','negativeEmoticons','pronouns','logTotalWords','sentiment'])
countRows = 0
countFreq = 0

for text in data.loc[:,'review']:
    countColumns=0
    #matrix[countRows][countColumns] = countRows
    #countColumns+=1
    wordList = tokenizer(text)
    reviewList.append(wordList)
    positiveWords = 0
    negativeWord = 0
    positiveEmoticons = 0
    negativeEmoticons = 0
    pronouns = 0
    exclamation = 0 
    noWord = 0 
    totalWords = 0
    for word in wordList:
        if word in linesPositive:
            positiveWords+=1
        if word in linesNegative:
            negativeWord+=1
        if word in linesEmoPositive:
            positiveEmoticons+=1
        if word in linesEmoNegative:
            negativeEmoticons+=1 
        if word in linesPronouns:
            pronouns+=1
        totalWords+=1
    logTotalWords = np.log(totalWords)
    row = [int(positiveWords),int(negativeWord),int(positiveEmoticons),int(negativeEmoticons),int(pronouns),logTotalWords,data.loc[countRows,'sentiment']]
    matrix.loc[countRows] = row
    if (countRows % 5000 == 0) or (countRows == 0):
        print("charging: ",countRows/500,"%")  
    countRows+=1

In [None]:
matrix.tail()

In [None]:
np.savetxt("newMatrix.csv", matrix, delimiter=",")

### Read matrix from the File "newMatrix.csv"

In [5]:
#Read from file "newMatrix.csv"
matrix = pd.read_csv('newMatrix.csv')
matrix.head()

Unnamed: 0,positiveWords,negativeWords,positiveEmoticons,negativeEmoticons,pronouns,logTotalWords,sentiment
0,7.0,13.0,0.0,0.0,1.0,4.9,1.0
1,12.0,9.0,0.0,0.0,12.0,4.9,0.0
2,11.0,14.0,0.0,0.0,6.0,5.1,0.0
3,4.0,0.0,0.0,0.0,7.0,3.8,1.0
4,3.0,2.0,0.0,0.0,3.0,4.2,0.0


### Matrix of Correlation 

In [6]:
corrmat = matrix.corr().abs()
corrmat

Unnamed: 0,positiveWords,negativeWords,positiveEmoticons,negativeEmoticons,pronouns,logTotalWords,sentiment
positiveWords,1.0,0.535596,0.011673,0.01006,0.362127,0.729543,0.228444
negativeWords,0.535596,1.0,0.024823,0.007532,0.335587,0.725455,0.217318
positiveEmoticons,0.011673,0.024823,1.0,0.017779,0.035927,0.003645,0.032023
negativeEmoticons,0.01006,0.007532,0.017779,1.0,0.005792,0.006,0.01598
pronouns,0.362127,0.335587,0.035927,0.005792,1.0,0.476654,0.029281
logTotalWords,0.729543,0.725455,0.003645,0.006,0.476654,1.0,0.004701
sentiment,0.228444,0.217318,0.032023,0.01598,0.029281,0.004701,1.0


In [7]:
matrix.insert(0, "theta", 0.5)

In [8]:
train = matrix.loc[0:40000,:]
test = matrix.loc[40000:50000,:]
train.shape

(40001, 8)

In [9]:
trainData = train.values
testData = test.values

In [10]:
trainData[0:10,:]

array([[ 0.5,  7. , 13. ,  0. ,  0. ,  1. ,  4.9,  1. ],
       [ 0.5, 12. ,  9. ,  0. ,  0. , 12. ,  4.9,  0. ],
       [ 0.5, 11. , 14. ,  0. ,  0. ,  6. ,  5.1,  0. ],
       [ 0.5,  4. ,  0. ,  0. ,  0. ,  7. ,  3.8,  1. ],
       [ 0.5,  3. ,  2. ,  0. ,  0. ,  3. ,  4.2,  0. ],
       [ 0.5,  9. ,  2. ,  0. ,  0. ,  6. ,  4.3,  1. ],
       [ 0.5, 11. ,  3. ,  0. ,  0. ,  5. ,  5.2,  1. ],
       [ 0.5,  3. ,  2. ,  0. ,  0. ,  1. ,  4.3,  1. ],
       [ 0.5,  5. ,  2. ,  0. ,  0. ,  3. ,  4.2,  1. ],
       [ 0.5,  9. ,  4. ,  0. ,  0. ,  0. ,  4.4,  1. ]])

# 3. Implementation Model

3.1. Implementation Sigmoid function

In [11]:
def sigmoid(M):
    for i in range(0,10000):
        M[i] = 1 / (1 + math.exp(-M[i]))
    return M 

3.2. Implementation Logistics Regression

In [12]:
#Define My Linear Regression 
def MyLinearRegression(pTrain,pYTrain,pTest,pYTest,cross):
    alfa = np.power(10.0,-6.0)
    lamba = np.power(10.0,-8.0)
    parameters = np.empty((7,1))
    parameters[:] = 0.1 * np.random.rand() 
    count = 0
    cycle = 5000
    max = 0
    while(count < cycle):
        cost = 0
        for i in range(1,4):
            currentTrain = pTrain[10000*(i-1):(i*10000),:]
            currentYTrain = pYTrain[10000*(i-1):(i*10000)]
            h = np.matmul(currentTrain,parameters)
            y = sigmoid(h)
            error = y - currentYTrain
            derivate = np.matmul(error.T,currentTrain)
            parameters = parameters - (alfa *derivate.T) - (lamba * parameters)
            loss = -1*np.mean(np.multiply(currentYTrain,np.log(y)) + np.multiply((1-currentYTrain),np.log(1-y)))
        count= count + 1
        if (count % 1000 == 0) or (count == 1):  
            hTest = np.matmul(pTest,parameters)
            hTest = sigmoid(hTest)
            print("----Cross validation ",cross," ----")
            print("----Epoch ",count," ----")
            print("loss: ",loss)
            print("accuracy: ",accuracy_score(pYTest, hTest.round()))
        alfa = alfa + np.power(10.0,-15.0)
    if cross == 3:
        global mrlWeight
        mrlWeight = parameters

In [13]:
### Cross validation 
kFolds = KFold(n_splits=4)
countCross = 1
print(trainData.shape)
for train, test in kFolds.split(trainData):
    trainD = trainData[train,0:7]
    testD = trainData[test,0:7]
    yTrain = trainData[train,7:8]
    yTest = trainData[test,7:8]
    print("------------- My Logistics Regression ---------")
    MyLinearRegression(trainD,yTrain,testD,yTest,countCross)
    countCross = countCross + 1

(40001, 8)
------------- My Logistics Regression ---------
----Cross validation  1  ----
----Epoch  1  ----
loss:  0.6850454588554895
accuracy:  0.5924407559244076
----Cross validation  1  ----
----Epoch  1000  ----
loss:  0.5584875067336665
accuracy:  0.7207279272072793
----Cross validation  1  ----
----Epoch  2000  ----
loss:  0.5584760434692756
accuracy:  0.7208279172082792
----Cross validation  1  ----
----Epoch  3000  ----
loss:  0.5584653781933328
accuracy:  0.7207279272072793
----Cross validation  1  ----
----Epoch  4000  ----
loss:  0.5584554450482067
accuracy:  0.7206279372062794
----Cross validation  1  ----
----Epoch  5000  ----
loss:  0.558446184097401
accuracy:  0.7206279372062794
------------- My Logistics Regression ---------
----Cross validation  2  ----
----Epoch  1  ----
loss:  0.772898366621062
accuracy:  0.5145
----Cross validation  2  ----
----Epoch  1000  ----
loss:  0.5583689934652375
accuracy:  0.7326
----Cross validation  2  ----
----Epoch  2000  ----
loss:  0.

# 4. Test

In [14]:
xTest = testData[:,0:7]
yTest = testData[:,7:8]

In [15]:
mrlWeight.shape

(7, 1)

In [16]:
np.savetxt("parameters.csv", mrlWeight, delimiter=",")

In [18]:
predictTest = np.matmul(xTest,mrlWeight)
predictTest = sigmoid(predictTest)
print("accuracy: ",accuracy_score(yTest, predictTest.round()))

accuracy:  0.7274
