import urllib.request

# Remote locations of the UMICH-SI650 test and training text files.
test_data_url = "https://dl.dropboxusercontent.com/u/8082731/datasets/UMICH-SI650/testdata.txt"
train_data_url = "https://dl.dropboxusercontent.com/u/8082731/datasets/UMICH-SI650/training.txt"

In [1]:
# Local file names the two downloads are saved under.
test_data_file_name = 'test_data.csv'
train_data_file_name = 'train_data.csv'

# Fetch each remote file to disk; whatever urlretrieve returns is kept as-is.
test_data_f = urllib.request.urlretrieve(
    url=test_data_url,
    filename=test_data_file_name,
)
train_data_f = urllib.request.urlretrieve(
    url=train_data_url,
    filename=train_data_file_name,
)

In [2]:
import pandas as pd

# Both files are read as tab-separated text with no header row.
# quoting=3 is csv.QUOTE_NONE, so quote characters in the sentences are
# treated as ordinary characters rather than field delimiters.
test_data_df = pd.read_csv(test_data_file_name,header=None,delimiter="\t",quoting=3)
# The test frame gets a single column name: the sentence text.
test_data_df.columns = ["Text"]

train_data_df = pd.read_csv(train_data_file_name,header=None,delimiter="\t",quoting=3)
# The training frame gets two column names: the sentiment value, then the text.
train_data_df.columns = ["Sentiment","Text"]

In [3]:
# Show the (rows, columns) dimensions of the training frame.
train_data_df.shape

(7086, 2)

In [4]:
# Show the (rows, columns) dimensions of the test frame.
test_data_df.shape

(33052, 1)

In [5]:
# Preview the first rows of the training frame.
train_data_df.head()

Unnamed: 0,Sentiment,Text
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [6]:
# Preview the first rows of the test frame.
test_data_df.head()

Unnamed: 0,Text
0,""" I don't care what anyone says, I like Hillar..."
1,have an awesome time at purdue!..
2,"Yep, I'm still in London, which is pretty awes..."
3,"Have to say, I hate Paris Hilton's behavior bu..."
4,i will love the lakers.


In [7]:
# Count how many training rows carry each distinct Sentiment value.
train_data_df.Sentiment.value_counts()

1    3995
0    3091
Name: Sentiment, dtype: int64

In [8]:
import numpy as np

# Average number of space-separated pieces per training sentence.
piece_counts = [len(sentence.split(' ')) for sentence in train_data_df.Text]
np.mean(piece_counts)

10.886819079875812

In [9]:
import nltk,re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer

# Single Porter stemmer instance, passed to stem_tokens() by tokenize().
stemmer = PorterStemmer()

def stem_tokens(tokens,stemmer):
    """Return a list holding the stem of each token, in input order.

    tokens: iterable of token strings.
    stemmer: object exposing a ``stem(token)`` method; it is called once
        per token.
    """
    return [stemmer.stem(word) for word in tokens]

def tokenize(text):
    """Split ``text`` into a list of stemmed word tokens.

    Every character that is not an ASCII letter is replaced with a space,
    the result is tokenized with ``nltk.word_tokenize``, and each token is
    stemmed through ``stem_tokens`` using the module-level ``stemmer``.
    """
    letters_only = re.sub("[^a-zA-Z]"," ",text)
    return stem_tokens(nltk.word_tokenize(letters_only),stemmer)

# Word-level count vectorizer that tokenizes with tokenize() and keeps at most 85 features.
# NOTE(review): the vocabulary printed further down contains stems such as 'thi', 'wa'
# and 'ha', which suggests the 'english' stop-word list is matched against tokens that
# have already been stemmed and therefore misses them — confirm and adjust if unintended.
vectorizer = CountVectorizer(analyzer='word',tokenizer=tokenize,lowercase=True,stop_words='english',max_features=85)

In [10]:
# Fit the vocabulary on training and test sentences together and build the count matrix.
# Training sentences come first in the concatenated list, followed by the test sentences.
corpus_data_features = vectorizer.fit_transform(train_data_df.Text.tolist() + test_data_df.Text.tolist())

In [11]:
# Convert the feature matrix with toarray() and show its (rows, columns) dimensions.
corpus_data_features_nd = corpus_data_features.toarray()
corpus_data_features_nd.shape

(40138, 85)

In [12]:
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement,
# get_feature_names_out(), when the installed version has it, and fall back to
# the old method on older releases. .tolist() keeps vocab a plain list of str,
# matching what the old method returned.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
vocab

['aaa',
 'amaz',
 'angelina',
 'awesom',
 'beauti',
 'becaus',
 'boston',
 'brokeback',
 'citi',
 'code',
 'cool',
 'cruis',
 'd',
 'da',
 'drive',
 'francisco',
 'friend',
 'fuck',
 'geico',
 'good',
 'got',
 'great',
 'ha',
 'harri',
 'harvard',
 'hate',
 'hi',
 'hilton',
 'honda',
 'imposs',
 'joli',
 'just',
 'know',
 'laker',
 'left',
 'like',
 'littl',
 'london',
 'look',
 'lot',
 'love',
 'm',
 'macbook',
 'make',
 'miss',
 'mission',
 'mit',
 'mountain',
 'movi',
 'need',
 'new',
 'oh',
 'onli',
 'pari',
 'peopl',
 'person',
 'potter',
 'purdu',
 'realli',
 'right',
 'rock',
 's',
 'said',
 'san',
 'say',
 'seattl',
 'shanghai',
 'stori',
 'stupid',
 'suck',
 't',
 'thi',
 'thing',
 'think',
 'time',
 'tom',
 'toyota',
 'ucla',
 've',
 'vinci',
 'wa',
 'want',
 'way',
 'whi',
 'work']

In [13]:
# Sum down the rows (axis=0): one total per vocabulary column across the whole corpus.
dist = np.sum(corpus_data_features_nd,axis=0)

In [14]:
# Print each vocabulary term next to its corpus-wide total, count first.
for tag,count in zip(vocab,dist):
    print(count,tag)

1179 aaa
485 amaz
1765 angelina
3170 awesom
2146 beauti
1694 becaus
2190 boston
2000 brokeback
423 citi
2003 code
481 cool
2031 cruis
439 d
2087 da
433 drive
1926 francisco
477 friend
452 fuck
1085 geico
773 good
571 got
1178 great
776 ha
2094 harri
2103 harvard
4492 hate
794 hi
2086 hilton
2192 honda
1098 imposs
1764 joli
1054 just
896 know
2019 laker
425 left
4080 like
507 littl
2233 london
811 look
421 lot
10334 love
1568 m
1059 macbook
631 make
1098 miss
1101 mission
1340 mit
2081 mountain
1207 movi
1220 need
459 new
551 oh
674 onli
2094 pari
1018 peopl
454 person
2093 potter
1167 purdu
2126 realli
661 right
475 rock
3914 s
495 said
2038 san
627 say
2019 seattl
1189 shanghai
467 stori
2886 stupid
4614 suck
1455 t
1705 thi
662 thing
1524 think
781 time
2117 tom
2028 toyota
2008 ucla
774 ve
2001 vinci
3703 wa
1656 want
932 way
547 whi
512 work


In [15]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection (available since 0.18).
from sklearn.model_selection import train_test_split

# Only the first len(train_data_df) rows of the feature matrix belong to the
# training sentences (they were placed first when the vectorizer was fitted).
# Keep 85% of them for fitting and hold out the rest for evaluation; the fixed
# random_state makes the split repeatable.
x_train,x_test,y_train,y_test = train_test_split(
    corpus_data_features_nd[0:len(train_data_df)],
    train_data_df.Sentiment,
    train_size=0.85,
    random_state=1234,
)



In [16]:
from sklearn.linear_model import LogisticRegression

# Fit a default-configured logistic regression on the training split.
# fit() hands back the estimator itself, so construction and fitting are chained.
log_model = LogisticRegression().fit(x_train,y_train)

In [17]:
# Predict labels for the held-out part of the training data.
y_pred = log_model.predict(x_test)

In [18]:
from sklearn.metrics import classification_report
# Print the text report comparing the held-out labels with the predictions.
print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.98      0.99      0.98       467
          1       0.99      0.98      0.99       596

avg / total       0.98      0.98      0.98      1063



In [19]:
# Retrain on every labelled row, then predict the rows that came from the test file.
# The feature matrix holds the training rows first, followed by the test rows.
n_train = len(train_data_df)
log_model = LogisticRegression().fit(corpus_data_features_nd[0:n_train],train_data_df.Sentiment)

test_pred = log_model.predict(corpus_data_features_nd[n_train:])

In [20]:
import random 
# Pick 15 distinct positions in the test predictions. No seed is set here,
# so a different sample is drawn on each run.
spl = random.sample(range(len(test_pred)),15)

# Print the predicted label followed by the sentence it was predicted for.
for text,sentiment in zip(test_data_df.Text[spl],test_pred[spl]):
    print(sentiment,text)

1 I want to love Boston like I loved EL.
0 oh well, UCLA sucks anyways(
0 I hate paris hilton too!
0 By the way I still hate UCLA...
1 I miss Purdue right now..
0 UCLA is stupid, I realized.
1 Now I'm walking around looking like the ugliest person in the world angelina jolie!!!!!!!
0 I HATE LONDON!..
1 I love Angelina Jolie....
1 I want a tour of London on the back of that bike!
1 lol i like allstate through the food..
1 i love seattle so much.
1 i love volkswagen.
0 the jack on a Hyundai sucks major ass.
0 Now MIT seems boring in comparison.


In [None]:
from sklearn.neighbors import KNeighborsClassifier
