In [1]:
import pandas as pd
import numpy as np
import bz2

In [2]:
def get_labels_and_texts(file):
    labels = []
    texts = []
    for line in bz2.BZ2File(file):
        x = line.decode("utf-8")
        labels.append(int(x[9]) - 1)
        texts.append(x[10:].strip())
    return np.array(labels), texts

train_lables, train_texts = get_labels_and_texts(r"C:\Users\Manoj\Downloads\train.ft.txt.bz2\train.ft.txt.bz2")
test_lables, test_texts = get_labels_and_texts(r"C:\Users\Manoj\Downloads\test.ft.txt.bz2\test.ft.txt.bz2")


In [4]:
train_lables[0]

1

In [5]:
train_texts[0]

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [6]:
# Extracting top 500 outof 50,000 since we are working on logistic regression

train_lables = train_lables[0:500]
train_texts = train_texts[0:500]

In [31]:
# text preprocessing: vomiting all special characters, punctuation marks(,.) and explanatory marks and turning all into lower case
import re

NON_ALPHANUM = re.compile(r'[\W]')

NON_ASCII = re.compile(r'[^a-z0-1\s]')

def normalise_texts(texts):
    normalise_texts = []
    for text in texts:
        lower = text.lower()
        no_punctuation = NON_ALPHANUM.sub(r' ', lower)
        no_non_ascii = NON_ASCII.sub(r'', no_punctuation)
        normalise_texts.append(no_non_ascii)
    return normalise_texts

train_texts = normalise_texts(train_texts)
test_texts = normalise_texts(test_texts)

In [29]:
train_texts[0]

'stuning even for the non gamer  this sound track was beautiful  it paints the senery in your mind so well i would recomend it even to people who hate vid  game music  i have played the game chrono cross but out of all of the games i have ever played it has the best music  it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras  it would impress anyone who cares to listen    '

In [33]:
# Vectroizing all the text to make it as machine understandable : basic task in natural language processing

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary = True)
cv.fit(train_texts)
X = cv.transform(train_texts)
X_test = cv.transform(test_texts)

In [35]:
print(X_test)

  (0, 213)	1
  (0, 282)	1
  (0, 398)	1
  (0, 503)	1
  (0, 506)	1
  (0, 511)	1
  (0, 525)	1
  (0, 561)	1
  (0, 761)	1
  (0, 1305)	1
  (0, 1698)	1
  (0, 1701)	1
  (0, 1857)	1
  (0, 1864)	1
  (0, 1973)	1
  (0, 2090)	1
  (0, 2156)	1
  (0, 2193)	1
  (0, 2279)	1
  (0, 2288)	1
  (0, 2331)	1
  (0, 2525)	1
  (0, 2678)	1
  (0, 2681)	1
  (0, 2689)	1
  :	:
  (399999, 3817)	1
  (399999, 3987)	1
  (399999, 4059)	1
  (399999, 4194)	1
  (399999, 4363)	1
  (399999, 4413)	1
  (399999, 4667)	1
  (399999, 4773)	1
  (399999, 4888)	1
  (399999, 5087)	1
  (399999, 5090)	1
  (399999, 5094)	1
  (399999, 5100)	1
  (399999, 5104)	1
  (399999, 5107)	1
  (399999, 5125)	1
  (399999, 5186)	1
  (399999, 5191)	1
  (399999, 5455)	1
  (399999, 5540)	1
  (399999, 5609)	1
  (399999, 5610)	1
  (399999, 5620)	1
  (399999, 5649)	1
  (399999, 5759)	1


In [49]:
# Building model and testing its accuracy with train and val dataset which is splitted from train dataset only

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, train_lables, train_size = 0.75)

#for c in [0.01, 0.05, 0.25, 0.5, 1]:
#    lr = LogisticRegression(C = c)
 #   lr.fit(X_train, y_train)
  #  print(c, accuracy_score(y_val, lr.predict(X_val)))
lr = LogisticRegression()

lr.fit(X_train, y_train)

In [51]:
lr.predict(X_test[29]), test_lables[29]

(array([0]), 0)