# IR_Project

# Import Library

In [1]:
import numpy as np
import pandas as pd
import nltk as nltk

# Load Data

In [2]:
data = pd.read_csv("iphone6-negative.csv",encoding='latin-1')
data.head(5)

Unnamed: 0,TweetID,User_ID,Text,Sentiment
0,6.47399e+17,Rionagh,I've had an IPhone for like 3 years and I've n...,negative
1,6.47399e+17,your highness,I dont need the new iphone but I want it :(,negative
2,6.47399e+17,Little Liggins,fuck the iPhone 6s cus I'm not getting one :(,negative
3,6.474e+17,,Hopefully will be able to get my iPhone 6s tod...,negative
4,6.474e+17,Kenny Tosh,@clydesdalebank I've just bought a new iPhone ...,negative


In [3]:
data = data['Text']
data.head()

0    I've had an IPhone for like 3 years and I've n...
1          I dont need the new iphone but I want it :(
2        fuck the iPhone 6s cus I'm not getting one :(
3    Hopefully will be able to get my iPhone 6s tod...
4    @clydesdalebank I've just bought a new iPhone ...
Name: Text, dtype: object

# Tokenizer

In [4]:
from nltk.tokenize import word_tokenize

def tokenize(text):
    tokens = word_tokenize(text)
    return tokens

data_tokenize = data.apply(lambda x: tokenize(x.lower()))

data_tokenize.head()

0    [i, 've, had, an, iphone, for, like, 3, years,...
1    [i, dont, need, the, new, iphone, but, i, want...
2    [fuck, the, iphone, 6s, cus, i, 'm, not, getti...
3    [hopefully, will, be, able, to, get, my, iphon...
4    [@, clydesdalebank, i, 've, just, bought, a, n...
Name: Text, dtype: object

# Stop word

In [5]:
from nltk.corpus import stopwords
stopwords_En = nltk.corpus.stopwords.words('english')

def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopwords_En]
    return text

data_stop_word = data_tokenize.apply(lambda x: remove_stopwords(x))

data_stop_word.head()

0    ['ve, iphone, like, 3, years, 've, never, done...
1                [dont, need, new, iphone, want, :, (]
2      [fuck, iphone, 6s, cus, 'm, getting, one, :, (]
3    [hopefully, able, get, iphone, 6s, today, :, (...
4    [@, clydesdalebank, 've, bought, new, iphone, ...
Name: Text, dtype: object

# Stemming

In [23]:
ps = nltk.PorterStemmer()

def stemming(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text

data_stemming = data_stop_word.apply(lambda x: stemming(x))

data_stemming.head()

0    ['ve, iphon, like, 3, year, 've, never, done, ...
1                 [dont, need, new, iphon, want, :, (]
2            [fuck, iphon, 6s, cu, 'm, get, one, :, (]
3    [hope, abl, get, iphon, 6s, today, :, (, plane...
4    [@, clydesdalebank, 've, bought, new, iphon, c...
Name: Text, dtype: object

# Term-Document Matrix

In [7]:
def term(term_text):
    text = {term for doc in range(0,5) for term in data_stemming[doc]}
    return text

unique_terms = term(data_stemming)
unique_terms

{"'m",
 "'ve",
 '(',
 '.',
 '/',
 '11:30',
 '3',
 '6s',
 ':',
 '?',
 '@',
 'abl',
 'appl',
 'bought',
 'ca',
 'clydesdalebank',
 'cu',
 'done',
 'dont',
 'fuck',
 'get',
 'hate',
 'hope',
 'iphon',
 'life',
 'like',
 "n't",
 'need',
 'never',
 'new',
 'one',
 'pay',
 'plane',
 'support',
 'today',
 'use',
 'want',
 'year'}

In [8]:
doc_term_matrix = {}

for term in unique_terms:
    doc_term_matrix[term] = []
    
    for doc in range(0,5):
        if term in data_stemming[doc]:
            doc_term_matrix[term].append(1)
        else: doc_term_matrix[term].append(0)

doc_term_matrix

{'hope': [0, 0, 0, 1, 0],
 'ca': [0, 0, 0, 0, 1],
 'appl': [0, 0, 0, 0, 1],
 'iphon': [1, 1, 1, 1, 1],
 "n't": [0, 0, 0, 0, 1],
 '?': [0, 0, 0, 0, 1],
 'new': [0, 1, 0, 0, 1],
 '(': [1, 1, 1, 1, 1],
 '@': [0, 0, 0, 0, 1],
 "'ve": [1, 0, 0, 0, 1],
 'one': [0, 0, 1, 0, 0],
 'pay': [0, 0, 0, 0, 1],
 'bought': [0, 0, 0, 0, 1],
 'abl': [0, 0, 0, 1, 0],
 'done': [1, 0, 0, 0, 0],
 'dont': [0, 1, 0, 0, 0],
 'plane': [0, 0, 0, 1, 0],
 'clydesdalebank': [0, 0, 0, 0, 1],
 '.': [0, 0, 0, 0, 1],
 'support': [0, 0, 0, 0, 1],
 "'m": [0, 0, 1, 0, 0],
 'never': [1, 0, 0, 0, 0],
 'want': [0, 1, 0, 0, 0],
 'get': [0, 0, 1, 1, 1],
 '6s': [0, 0, 1, 1, 0],
 'life': [1, 0, 0, 0, 0],
 ':': [1, 1, 1, 1, 1],
 'use': [0, 0, 0, 0, 1],
 'today': [0, 0, 0, 1, 0],
 'year': [1, 0, 0, 0, 0],
 '11:30': [0, 0, 0, 1, 0],
 '3': [1, 0, 0, 0, 0],
 'cu': [0, 0, 1, 0, 0],
 '/': [0, 0, 0, 1, 0],
 'fuck': [0, 0, 1, 0, 0],
 'need': [0, 1, 0, 0, 0],
 'like': [1, 0, 0, 1, 0],
 'hate': [1, 0, 0, 0, 0]}

In [9]:
docs_array = np.array(data_stemming[:5], dtype='object')
print(docs_array)

v1 = np.array(doc_term_matrix['iphon'])    
v2 = np.array(doc_term_matrix['bought'])

print(v1)
print(v2)
print('-------')
v3 = v1 & v2
print(v3)

[list(["'ve", 'iphon', 'like', '3', 'year', "'ve", 'never', 'done', ':', '(', 'hate', 'life'])
 list(['dont', 'need', 'new', 'iphon', 'want', ':', '('])
 list(['fuck', 'iphon', '6s', 'cu', "'m", 'get', 'one', ':', '('])
 list(['hope', 'abl', 'get', 'iphon', '6s', 'today', ':', '(', 'plane', 'get', 'like', '11:30', ':', '/'])
 list(['@', 'clydesdalebank', "'ve", 'bought', 'new', 'iphon', 'ca', "n't", 'use', 'appl', 'pay', "n't", 'support', '.', 'get', 'use', '?', ':', '('])]
[1 1 1 1 1]
[0 0 0 0 1]
-------
[0 0 0 0 1]


In [10]:
[doc for doc in v3 * docs_array if doc]

[['@',
  'clydesdalebank',
  "'ve",
  'bought',
  'new',
  'iphon',
  'ca',
  "n't",
  'use',
  'appl',
  'pay',
  "n't",
  'support',
  '.',
  'get',
  'use',
  '?',
  ':',
  '(']]

# Inverted Index

In [11]:
for i, doc in enumerate(data_stemming[:5]):
    print(i)
    print(doc)

0
["'ve", 'iphon', 'like', '3', 'year', "'ve", 'never', 'done', ':', '(', 'hate', 'life']
1
['dont', 'need', 'new', 'iphon', 'want', ':', '(']
2
['fuck', 'iphon', '6s', 'cu', "'m", 'get', 'one', ':', '(']
3
['hope', 'abl', 'get', 'iphon', '6s', 'today', ':', '(', 'plane', 'get', 'like', '11:30', ':', '/']
4
['@', 'clydesdalebank', "'ve", 'bought', 'new', 'iphon', 'ca', "n't", 'use', 'appl', 'pay', "n't", 'support', '.', 'get', 'use', '?', ':', '(']


In [12]:
inverted_index = {}

for i, doc in enumerate(data_stemming[:5]):
    for term in doc:
        if term in inverted_index:
            inverted_index[term].add(i)
        else: inverted_index[term] = {i}

inverted_index

{"'ve": {0, 4},
 'iphon': {0, 1, 2, 3, 4},
 'like': {0, 3},
 '3': {0},
 'year': {0},
 'never': {0},
 'done': {0},
 ':': {0, 1, 2, 3, 4},
 '(': {0, 1, 2, 3, 4},
 'hate': {0},
 'life': {0},
 'dont': {1},
 'need': {1},
 'new': {1, 4},
 'want': {1},
 'fuck': {2},
 '6s': {2, 3},
 'cu': {2},
 "'m": {2},
 'get': {2, 3, 4},
 'one': {2},
 'hope': {3},
 'abl': {3},
 'today': {3},
 'plane': {3},
 '11:30': {3},
 '/': {3},
 '@': {4},
 'clydesdalebank': {4},
 'bought': {4},
 'ca': {4},
 "n't": {4},
 'use': {4},
 'appl': {4},
 'pay': {4},
 'support': {4},
 '.': {4},
 '?': {4}}

In [13]:
posting_list = inverted_index['new']
posting_list

{1, 4}

In [14]:
def or_postings(posting1, posting2):
    p1 = 0
    p2 = 0
    result = list()
    while p1 < len(posting1) and p2 < len(posting2):
        if posting1[p1] == posting2[p2]:
            result.append(posting1[p1]) #2
            p1 += 1
            p2 += 1
        elif posting1[p1] > posting2[p2]:
            result.append(posting2[p2])
            p2 += 1
        else:
            result.append(posting1[p1])
            p1 += 1
    while p1 < len(posting1):
        result.append(posting1[p1])
        p1 += 1
    while p2 < len(posting2):
        result.append(posting2[p2])
        p2 += 1
    return result

In [15]:
pl_1 = list(inverted_index['get'])
pl_2 = list(inverted_index['want'])
or_postings(pl_1, pl_2) 

[1, 2, 3, 4]

In [16]:
def and_postings(posting1, posting2):
    p1 = 0
    p2 = 0
    result = list()
    while p1 < len(posting1) and p2 < len(posting2):
        if posting1[p1] == posting2[p2]:
            result.append(posting1[p1])
            p1 += 1
            p2 += 1
        elif posting1[p1] > posting2[p2]:
            p2 += 1
        else:
            p1 += 1
    return result

In [17]:
pl_1 = list(inverted_index['iphon'])
pl_2 = list(inverted_index['bought'])
and_postings(pl_1, pl_2)

[4]

# TF-IDF

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer( ngram_range=(1,1))
features_tfidf = tfidf.fit_transform(term for index in range(0,5) for term in data_stemming[index])
print(features_tfidf.shape)
print('Sparse Matrix :\n', features_tfidf)

features_tfidf2 = pd.DataFrame(features_tfidf.toarray())
features_tfidf2

(61, 30)
Sparse Matrix :
   (0, 27)	1.0
  (1, 15)	1.0
  (2, 17)	1.0
  (4, 29)	1.0
  (5, 27)	1.0
  (6, 19)	1.0
  (7, 9)	1.0
  (10, 13)	1.0
  (11, 16)	1.0
  (12, 10)	1.0
  (13, 18)	1.0
  (14, 20)	1.0
  (15, 15)	1.0
  (16, 28)	1.0
  (19, 11)	1.0
  (20, 15)	1.0
  (21, 2)	1.0
  (22, 8)	1.0
  (24, 12)	1.0
  (25, 21)	1.0
  (28, 14)	1.0
  (29, 3)	1.0
  (30, 12)	1.0
  (31, 15)	1.0
  (32, 2)	1.0
  (33, 25)	1.0
  (36, 23)	1.0
  (37, 12)	1.0
  (38, 17)	1.0
  (39, 1)	0.7071067811865475
  (39, 0)	0.7071067811865475
  (43, 7)	1.0
  (44, 27)	1.0
  (45, 5)	1.0
  (46, 20)	1.0
  (47, 15)	1.0
  (48, 6)	1.0
  (50, 26)	1.0
  (51, 4)	1.0
  (52, 22)	1.0
  (54, 24)	1.0
  (56, 12)	1.0
  (57, 26)	1.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
58,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
feature_names = tfidf.get_feature_names()
feature_names



['11',
 '30',
 '6s',
 'abl',
 'appl',
 'bought',
 'ca',
 'clydesdalebank',
 'cu',
 'done',
 'dont',
 'fuck',
 'get',
 'hate',
 'hope',
 'iphon',
 'life',
 'like',
 'need',
 'never',
 'new',
 'one',
 'pay',
 'plane',
 'support',
 'today',
 'use',
 've',
 'want',
 'year']

In [20]:
dense = features_tfidf.todense()
dense

matrix([[0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [21]:
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
df.head()

Unnamed: 0,11,30,6s,abl,appl,bought,ca,clydesdalebank,cu,done,...,new,one,pay,plane,support,today,use,ve,want,year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
