In [3]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from html.parser import HTMLParser
#from src import *
import pandas as pd
from datetime import datetime,timedelta
import numpy as np
import re


## Set-up & Cleaning

In [4]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

In [5]:
df = pd.read_csv('twitter_vader_final.csv', index_col=0)

df = df.drop(['noise', 'word_tokens'], axis = 1)

## add new list of stopwords

In [6]:
common_words = ['toronto', 'go', 'transit', 'gotransit', 'bus', 'train', 'gotrain', 'transportation', 'ttc', 'metrolinx']

stopwords.update(common_words)

In [7]:
stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'bus',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'go',
 'gotrain',
 'gotransit',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'metrolinx',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'ou

In [8]:
#strip tweets for TFIDF
def clean_tweets(tweet):
    try:
        clean_tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',tweet) #remove URLs
        clean_tweet = re.sub('[^\x20-\x7E]', ' ', clean_tweet)
        clean_tweet = re.sub('pic.twitter.com/\w*', '', clean_tweet)
        clean_tweet = re.sub('[^a-zA-Z]', ' ', clean_tweet) #remove numbers by matching any character NOT A-Z
        clean_tweet = clean_tweet.lower() #set to lowercase
        word_tokens = word_tokenize(clean_tweet)
        filtered = [word for word in word_tokens if word not in stopwords]
        clean_tweet = (' '.join(filtered)).strip()
        return clean_tweet
    except:
        return ''

    
df['clean_TFIDF'] = df['clean_body'].apply(lambda x: clean_tweets(x))
df = df.loc[df.clean_TFIDF != ''] #remove empty tweets

In [9]:
#create df of tweets without customer service accounts

df['username'] = df.username.str.lower()
df = df.loc[(df.username != 'prestocard') & (df.username != 'ttchelps')]#6484 rows eliminated

In [8]:
df

Unnamed: 0,body,username,retweet,favourite,date,geo,mentions,hashtags,preg_label,Presto_label,...,station_label,go_label,clean_body,vader,compound_sentiment,positive_sentiment,negative_sentiment,neutral_sentiment,sent_cat,clean_TFIDF
0,Good grief. Again? Thanks for the heads up. I’...,saromasbo,0,0,2019-09-19 23:51:38+00:00,,,,0,0,...,1,0,Good grief. Again? Thanks for the heads up. I ...,"{'neg': 0.117, 'neu': 0.66, 'pos': 0.223, 'com...",0.4389,0.223,0.117,0.660,neutral,good grief thanks heads lawrence side head reh...
1,"Hi @PRESTOcard, there are two more gates out o...",benjaminboles,0,2,2019-09-19 23:46:13+00:00,,@PRESTOcard,,0,1,...,1,0,"Hi PRESTOcard, there are two more gates out o...","{'neg': 0.0, 'neu': 0.945, 'pos': 0.055, 'comp...",0.1603,0.055,0.000,0.945,neutral,hi prestocard two gates order bedford park ent...
2,To the guy in the yellow shirt who sprinted al...,sbuchananto,0,5,2019-09-19 23:32:34+00:00,,,,0,1,...,1,0,To the guy in the yellow shirt who sprinted al...,"{'neg': 0.0, 'neu': 0.906, 'pos': 0.094, 'comp...",0.4995,0.094,0.000,0.906,neutral,guy yellow shirt sprinted way dundas west brid...
5,@TTChelps I’m visiting and want to buy a day p...,brentiminator,0,0,2019-09-19 21:15:08+00:00,,@TTChelps,,0,1,...,0,0,TTChelps I m visiting and want to buy a day p...,"{'neg': 0.065, 'neu': 0.895, 'pos': 0.04, 'com...",-0.2023,0.040,0.065,0.895,neutral,ttchelps visiting want buy day pass tomorrow f...
6,No pic because I don’t want it on my phone if ...,annakgustafson,0,1,2019-09-19 21:00:58+00:00,,,,0,1,...,0,0,No pic because I don t want it on my phone if ...,"{'neg': 0.058, 'neu': 0.901, 'pos': 0.041, 'co...",-0.1154,0.041,0.058,0.901,neutral,pic want phone cross border woman next subway ...
7,I hope my earbuds and my PRESTO card are happy...,bitcoinwarlock,0,3,2019-09-19 20:54:01+00:00,,,,0,1,...,0,0,I hope my earbuds and my PRESTO card are happy...,"{'neg': 0.0, 'neu': 0.625, 'pos': 0.375, 'comp...",0.7650,0.375,0.000,0.625,positive,hope earbuds presto card happy together wherever
8,If this has happened to me twice this week the...,jonnycargo,0,0,2019-09-19 18:50:19+00:00,,@OC_Transpo @ottawacity @PRESTOcard,#overcharged #money #interest #allthemoney #bu...,0,1,...,0,0,If this has happened to me twice this week the...,"{'neg': 0.05, 'neu': 0.879, 'pos': 0.071, 'com...",0.2263,0.071,0.050,0.879,neutral,happened twice week must many others overcharg...
9,how did u get him on the presto card???,koosblunt,0,0,2019-09-19 18:48:59+00:00,,,,0,1,...,0,0,how did u get him on the presto card???,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,0.000,0.000,1.000,neutral,u get presto card
10,Lowkey summer Jin is the cutest presto card pi...,jjikookiee,0,6,2019-09-19 18:46:02+00:00,,,,0,1,...,0,0,Lowkey summer Jin is the cutest presto card,"{'neg': 0.0, 'neu': 0.648, 'pos': 0.352, 'comp...",0.5859,0.352,0.000,0.648,positive,lowkey summer jin cutest presto card
11,@OC_Transpo @ottawacity @PRESTOcard whose #ban...,jonnycargo,0,0,2019-09-19 18:41:46+00:00,,@OC_Transpo @ottawacity @PRESTOcard @CBCTheNat...,#bankaccount #interest #faredisputes #corrupti...,0,1,...,0,0,OC_Transpo ottawacity PRESTOcard whose bank...,"{'neg': 0.073, 'neu': 0.785, 'pos': 0.142, 'co...",0.3182,0.142,0.073,0.785,neutral,oc transpo ottawacity prestocard whose bankacc...


In [10]:
def get_TFIDF(dfcol):
    cv=CountVectorizer(ngram_range=(2,3), max_features=20) #ngram: (min gram, max gram) i.e. (1,2)
    word_count_vector=cv.fit_transform(dfcol.unique()) #transforms the data set of words (df_col) into bag of words model
    feature_names = cv.get_feature_names() #words that fit criteria from cv

    tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) #produces TF-IDF score 
    tfidf_transformer.fit(word_count_vector) #fits IDF values from above onto our docxterm matrix from above
    tfidf_vector=tfidf_transformer.transform(word_count_vector) #fits TF-IDF values

    # Combining TF-IDF score with the words into a dictionary
    weights = np.asarray(tfidf_vector.mean(axis=0)).ravel().tolist() #create array from tfidf vector
    df_tfidf = pd.DataFrame(weights, index=feature_names, columns=["tfidf"]) #create dataframe
    df_tfidf = df_tfidf.sort_values(by=["tfidf"],ascending=False).reset_index() #sort values by highest tfidf
    df_tfidf.rename(index=str, columns={"index": "term"}, inplace=True) #rename columns

    # Counting the actual number of occurances and appending to dictionary of word
    # with TF-IDF value
    count_list = word_count_vector.toarray().sum(axis=0)
    actual_count_dict = {"term": feature_names, "counts": count_list}
    actual_count_df = pd.DataFrame.from_records(actual_count_dict)
    tfidf_and_count = pd.merge(df_tfidf, actual_count_df, on='term', how='left')
    return tfidf_and_count

## TFIDF by Keyword

In [11]:
#create dfs

df_presto = df.loc[df.Presto_label == 1]
df_ttc = df.loc[df.ttc_label == 1]
df_station = df.loc[df.station_label == 1]
df_go = df.loc[df.go_label == 1]

In [1]:
tfidf_station = get_TFIDF(df_station.clean_TFIDF)

NameError: name 'get_TFIDF' is not defined

In [295]:
#tfidf_df by keywords
tfidf_presto = get_TFIDF(df_presto.clean_TFIDF)
tfidf_presto = tfidf_presto.add_suffix('_presto')
tfidf_ttc = get_TFIDF(df_ttc.clean_TFIDF)
tfidf_ttc = tfidf_ttc.add_suffix('_ttc')
tfidf_station = get_TFIDF(df_station.clean_TFIDF)
tfidf_station = tfidf_station.add_suffix('_station')
tfidf_go = get_TFIDF(df_go.clean_TFIDF)
tfidf_go = tfidf_go.add_suffix('_go')
tfidf_keywords = pd.concat([tfidf_presto, tfidf_ttc, tfidf_station, tfidf_go], axis=1)
tfidf_keywords

Unnamed: 0,term_presto,tfidf_presto,counts_presto,term_ttc,tfidf_ttc,counts_ttc,term_station,tfidf_station,counts_station,term_go,tfidf_go,counts_go
0,presto card,0.261673,4866,presto card,0.05969,1248,union station,0.082746,1804,union station,0.025933,257
1,presto cards,0.024162,401,upwarp caused,0.025935,484,kitchener line,0.033395,687,kitchener line,0.014084,130
2,customer service,0.017341,294,topoli onpoli,0.017902,343,presto card,0.021368,452,presto card,0.01365,131
3,monthly pass,0.017076,321,onpoli topoli,0.015936,472,eglinton crosstown,0.019315,467,lakeshore east,0.013579,126
4,use presto,0.015232,262,ttchelps prestocard,0.011757,228,barrie line,0.015311,311,ride free,0.011821,139
5,ttchelps prestocard,0.014023,228,subway station,0.010593,215,milton line,0.013328,276,lakeshore west,0.011629,111
6,oc transpo,0.013974,234,prestocard ttchelps,0.00972,192,lakeshore west,0.012709,274,niagara falls,0.010198,102
7,onpoli topoli,0.012528,309,monthly pass,0.008935,195,rush hour,0.012526,278,wi fi,0.009884,98
8,tap presto,0.012411,212,rt bconnolly,0.008795,343,lakeshore east,0.011487,245,rush hour,0.009453,91
9,presto machines,0.011735,196,fare integration,0.008041,155,leslie woo,0.010684,210,barrie line,0.008595,80


In [341]:
df_presto_pos = df.loc[(df.Presto_label == 1) & (df.sent_cat == 'positive')]
df_presto_neg = df.loc[(df.Presto_label == 1) & (df.sent_cat == 'negative')]

get_TFIDF(df_presto_pos.clean_TFIDF)

Unnamed: 0,term,tfidf,counts
0,presto card,0.269489,928
1,customer service,0.021889,62
2,credit card,0.019972,63
3,presto cards,0.019964,69
4,ride free,0.016205,50
5,monthly pass,0.016087,52
6,use presto,0.015115,55
7,get presto,0.013315,41
8,prestocard ttchelps,0.011756,34
9,oc transpo,0.011585,36


In [338]:
tfidf_keywords.to_csv('twitter_tfidf_keywords_bitrigrams.csv')

## get TFIDF by stn

In [12]:
station_names = pd.read_csv('station_names.csv')
station_names = list(station_names.Station.str.lower())

#filter out station names from df_stations
df_station = df.loc[df.station_label == 1]

def get_stations(x):
    matches = [word for word in station_names if word in x]
    station_str = ", ".join(matches)
    return station_str

df_station['stations'] = df_station.clean_TFIDF.apply(lambda x: get_stations(x))


In [42]:
stations_list = df_station.stations.tolist()
stations_list_split = []
for stn in stations_list:
    stns = stn.split(', ')
    stations_list_split.extend(stns)

#print(stations_list_split)

stns_df = pd.DataFrame(stations_list_split)
stns_df[0].value_counts()

union                   12188
kitchener                4612
eglinton                 3636
king                     3387
milton                   2399
oshawa                   1926
bloor                    1614
hamilton                 1460
finch                    1368
brampton                 1104
barrie                    972
scarborough               805
danforth                  766
yonge                     761
guelph                    746
burlington                656
bramalea                  649
exhibition                629
bay                       569
kennedy                   542
oakville                  534
aldershot                 533
kipling                   526
pickering                 518
university                513
whitby                    465
stouffville               451
weston                    389
clarkson                  382
lawrence                  380
                        ...  
rideau                     15
st andrew                  14
glencairn 

### From top 5 mentioned stations, remove name & get TFIDFs for tweets with stn names

In [48]:
union_stn = df_station[df_station.clean_TFIDF.str.contains('union')]
union_stn['clean_TFIDF'] = union_stn.clean_TFIDF.apply(lambda x: re.sub('union', '', x))
uniontfidf = get_TFIDF(union_stn.clean_TFIDF)
uniontfidf = uniontfidf.add_suffix('_union')

kitchener_stn = df_station[df_station.clean_TFIDF.str.contains('kitchener')]
kitchener_stn['clean_TFIDF'] = kitchener_stn.clean_TFIDF.apply(lambda x: re.sub('kitchener', '', x))
kitchenertfidf = get_TFIDF(kitchener_stn.clean_TFIDF)
kitchenertfidf = kitchenertfidf.add_suffix('_kitchener')

eglinton_stn = df_station[df_station.clean_TFIDF.str.contains('eglinton')]
eglinton_stn['clean_TFIDF'] = eglinton_stn.clean_TFIDF.apply(lambda x: re.sub('eglinton', '', x))
eglintontfidf = get_TFIDF(eglinton_stn.clean_TFIDF)
eglintontfidf = eglintontfidf.add_suffix('_eglinton')

king_stn = df_station[df_station.clean_TFIDF.str.contains('king')]
king_stn['clean_TFIDF'] = king_stn.clean_TFIDF.apply(lambda x: re.sub('king', '', x))
kingtfidf = get_TFIDF(king_stn.clean_TFIDF)
kingtfidf = kingtfidf.add_suffix('_king')

milton_stn = df_station[df_station.clean_TFIDF.str.contains('milton')]
milton_stn['clean_TFIDF'] = milton_stn.clean_TFIDF.apply(lambda x: re.sub('milton', '', x))
miltontfidf = get_TFIDF(milton_stn.clean_TFIDF)
miltontfidf = miltontfidf.add_suffix('_milton')

tfidf_top5stations = pd.concat([uniontfidf, kitchenertfidf, eglintontfidf, kingtfidf, miltontfidf], axis=1)
tfidf_top5stations

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#in

Unnamed: 0,term_union,tfidf_union,counts_union,term_kitchener,tfidf_kitchener,counts_kitchener,term_eglinton,tfidf_eglinton,counts_eglinton,term_king,tfidf_king,counts_king,term_milton,tfidf_milton,counts_milton
0,lakeshore west,0.027298,145,union station,0.03848,63,crosstown lrt,0.098421,185,par lot,0.064167,200,ha lrt,0.045903,103
1,rush hour,0.02561,131,service line,0.031946,70,crosstownlrt construction,0.025398,52,union station,0.058639,170,union station,0.038115,90
2,lakeshore east,0.025092,122,rush hour,0.026912,45,crosstown construction,0.022132,35,kitchener line,0.027596,94,ha centre,0.036031,82
3,presto card,0.022674,116,mount pleasant,0.025782,43,light rail,0.018123,31,presto card,0.026486,73,service ha,0.026398,59
4,station terminal,0.018267,92,trains line,0.017051,27,rt crosstownto,0.01668,39,eglinton crosstown,0.015431,41,ha union,0.021855,48
5,pearson express,0.015254,72,delays line,0.016327,24,building crosstown,0.015695,28,par spots,0.014961,43,day service,0.01842,44
6,customer service,0.012315,64,town hall,0.014779,22,st clair,0.013552,38,par garage,0.014087,40,rush hour,0.016235,36
7,kitchener line,0.010759,55,rt gotransitkt,0.01368,29,victoria park,0.012045,24,rush hour,0.013595,41,lakeshore west,0.015367,37
8,york concourse,0.009911,51,pm express,0.013137,21,crosstown vehicles,0.011988,19,loo forward,0.013054,37,ha station,0.014431,31
9,la gare,0.009745,114,two way,0.01294,34,close bathurst,0.011729,19,barrie line,0.010785,29,union ha,0.014361,30


In [49]:
tfidf_top5stations.to_csv('Twitter_TFIDF_top5stns.csv')

## TFIDF by Month

In [297]:
#convert dates to date objects
df.date = df.date.map(lambda x: datetime.strptime(x,'%Y-%m-%d %H:%M:%S+00:00').date())

df['month_yr'] = df.date.apply(lambda x: str(x.month)+'-'+str(x.year))

#df.date.value_counts() #count occurrences of each date

In [298]:
def add_tfidf_col(dfcol,suffix):
    tfidf_df = get_TFIDF(dfcol)
    tfidf_df = tfidf_df.add_suffix(suffix)
    return tfidf_df

In [299]:
all_time = df.month_yr.unique()
tfidf_full = pd.DataFrame()

for month in all_time:
    df_monthly = df.loc[df.month_yr == month]
    tfidf_monthly = get_TFIDF(df_monthly.clean_TFIDF)
    tfidf_monthly['month_yr'] = month
    tfidf_all_month = tfidf_monthly
    
    #getting TFIDF for each sentiment per month
    df_pos = df_monthly.loc[df_monthly.sent_cat == 'positive']
    if len(df_pos) > 1:
        tfidf_pos = add_tfidf_col(df_pos.clean_TFIDF,'_positive')
        tfidf_all_month = tfidf_all_month.join(tfidf_pos, how='outer')
    
    df_neu = df_monthly.loc[df_monthly.sent_cat == 'neutral']
    if len(df_neu) > 1:
        tfidf_neu = add_tfidf_col(df_neu.clean_TFIDF,'_neutral')
        tfidf_all_month = tfidf_all_month.join(tfidf_neu, how='outer')
    
    df_neg = df_monthly.loc[df_monthly.sent_cat == 'negative']
    if len(df_neg) > 1:
        tfidf_neg = add_tfidf_col(df_neg.clean_TFIDF,'_negative')
        tfidf_all_month = tfidf_all_month.join(tfidf_neg, how='outer')
    tfidf_full = pd.concat([tfidf_full, tfidf_all_month], axis=0)
    
tfidf_full


Unnamed: 0,term,tfidf,counts,month_yr,term_positive,tfidf_positive,counts_positive,term_neutral,tfidf_neutral,counts_neutral,term_negative,tfidf_negative,counts_negative
0,upwarp caused,0.041923,484,9-2019,presto card,0.031932,78,upwarp caused,0.059273,473,presto card,0.038905,52
1,presto card,0.032591,398,9-2019,wi fi,0.015118,63,presto card,0.032304,268,union station,0.016138,40
2,regular service,0.008979,185,9-2019,good morning,0.013789,33,regular service,0.012284,183,fare evasion,0.015186,21
3,service resumed,0.008590,180,9-2019,free wi fi,0.011501,51,service resumed,0.011836,179,shuttle buses,0.014440,20
4,union station,0.008544,111,9-2019,free wi,0.011501,51,regular service resumed,0.011612,177,en raison,0.013522,50
5,regular service resumed,0.008435,178,9-2019,free wifi,0.011141,29,pas cher,0.010013,117,delayed minutes,0.012371,17
6,pas cher,0.007525,127,9-2019,pt win,0.010074,34,de la,0.008233,74,customer service,0.011470,17
7,wi fi,0.007189,88,9-2019,well done,0.008316,19,union station,0.006790,58,de minutes,0.009915,40
8,due collision,0.006818,79,9-2019,us know,0.007415,18,due collision,0.006403,65,le gobus,0.009009,29
9,de la,0.006732,93,9-2019,would like,0.006901,16,delays minutes,0.006310,73,retard de,0.008084,33


In [300]:
tfidf_full.to_csv('Twitter_TFIDF_sentiment_monthly_bitrigrams.csv')

## TFIDF by Hashtags

In [221]:
hashtags = pd.read_csv('chosen hashtags.csv')

In [225]:
hashtags['Hashtags']=hashtags.Hashtags.str.lower()
hashtags

Unnamed: 0,Hashtags
0,#ttc
1,#metrolinx
2,#presto
3,#gotransit
4,#gotrain
5,#gobus
6,#privatizationfailure
7,#upexpress
8,#collusion
9,#farecollectionprivatizationfailure


In [190]:
with_ht = df[df.hashtags.apply(type) != float] #pull rows with HT
with_ht['hashtags']=with_ht.hashtags.str.lower()
ht_list = with_ht.hashtags.tolist() #make into list

ht_list_split = []
for ht in ht_list: #iterate through list and extend list so that each element = 1 hashtag
    h = ht.split()
    ht_list_split.extend(h)

ht_df = pd.DataFrame(ht_list_split) #pass to df
ht_counts = ht_df[0].value_counts() #get counts easily

In [301]:
with_ht = df[df.hashtags.apply(type) != float] #pull rows with HT
with_ht['hashtags']=with_ht.hashtags.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [228]:
def getTFIDF(dfcol):
    cv=CountVectorizer(ngram_range=(2,3), max_features=20) #ngram: (min gram, max gram) i.e. (1,2)
    word_count_vector=cv.fit_transform(dfcol.unique()) #transforms the data set of words (df_col) into bag of words model
    feature_names = cv.get_feature_names() #words that fit criteria from cv

    tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) #produces TF-IDF score 
    tfidf_transformer.fit(word_count_vector) #fits IDF values from above onto our docxterm matrix from above
    tfidf_vector=tfidf_transformer.transform(word_count_vector) #fits TF-IDF values

    # Combining TF-IDF score with the words into a dictionary
    weights = np.asarray(tfidf_vector.mean(axis=0)).ravel().tolist() #create array from tfidf vector
    df_tfidf = pd.DataFrame(weights, index=feature_names, columns=["tfidf"]) #create dataframe
    df_tfidf = df_tfidf.sort_values(by=["tfidf"],ascending=False).reset_index() #sort values by highest tfidf
    df_tfidf.rename(index=str, columns={"index": "term"}, inplace=True) #rename columns

    # Counting the actual number of occurances and appending to dictionary of word
    # with TF-IDF value
    count_list = word_count_vector.toarray().sum(axis=0)
    actual_count_dict = {"term": feature_names, "counts": count_list}
    actual_count_df = pd.DataFrame.from_records(actual_count_dict)
    tfidf_and_count = pd.merge(df_tfidf, actual_count_df, on='term', how='left')
    return tfidf_and_count

In [302]:
ht_ttc = with_ht[with_ht.hashtags.str.contains('ttc')]
ht_metrolinx = with_ht[with_ht.hashtags.str.contains('metrolinx')]
ht_Presto = with_ht[with_ht.hashtags.str.contains('presto')]
ht_GoTransit = with_ht[with_ht.hashtags.str.contains('gotransit')]
ht_Privitization = with_ht[with_ht.hashtags.str.contains('privitizationfailure')]
ht_upexpress = with_ht[with_ht.hashtags.str.contains('upexpress')]
ht_collusion = with_ht[with_ht.hashtags.str.contains('collusion')]
ht_Accenture = with_ht[with_ht.hashtags.str.contains('accenture')]
ht_PrestoFail = with_ht[with_ht.hashtags.str.contains('prestofail')]
ht_Fail = with_ht[with_ht.hashtags.str.contains('fail')]
ht_eglinton = with_ht[with_ht.hashtags.str.contains('eglintoncrosstown')]
ht_scarb = with_ht[with_ht.hashtags.str.contains('scarborough')]
ht_lrt = with_ht[with_ht.hashtags.str.contains('lrt')]




In [310]:
ht_Presto = with_ht[with_ht.hashtags.str.contains('presto')]
ht_Presto_pos = ht_Presto.loc[ht_Presto.sent_cat == 'positive']
ht_Presto_neg = ht_Presto.loc[ht_Presto.sent_cat == 'negative']

In [345]:
getTFIDF(ht_upexpress.clean_TFIDF)

Unnamed: 0,term,tfidf,counts
0,onpoli topoli,0.178218,100
1,rt bconnolly,0.152105,110
2,bconnolly onpoli,0.121043,72
3,rt bconnolly onpoli,0.121043,72
4,presto upexpress,0.109678,73
5,bconnolly onpoli topoli,0.099439,56
6,scarborough scarbto,0.099426,70
7,eglintoncrosstown finchwest,0.090004,66
8,upexpress eglintoncrosstown,0.086594,65
9,presto upexpress eglintoncrosstown,0.075254,55


## Stations TFIDF by sentiment

In [304]:
df_station_pos = df.loc[(df.station_label == 1) & (df.sent_cat == 'positive')]
df_station_neu = df.loc[(df.station_label == 1) & (df.sent_cat == 'neutral')]
df_station_neg = df.loc[(df.station_label == 1) & (df.sent_cat == 'negative')]

tfidf_station_pos = getTFIDF(df_station_pos.clean_TFIDF)
tfidf_station_neu = getTFIDF(df_station_neu.clean_TFIDF)
tfidf_station_neg = getTFIDF(df_station_neg.clean_TFIDF)
tfidf_station_pos = tfidf_station_pos.add_suffix('_stn_pos')
tfidf_station_neu = tfidf_station_neu.add_suffix('_stn_neu')
tfidf_station_neg = tfidf_station_neg.add_suffix('_stn_neg')
tfidf_stn_sentiment = pd.concat([tfidf_station_pos, tfidf_station_neu, tfidf_station_neg], axis = 1)
tfidf_stn_sentiment

Unnamed: 0,term_stn_pos,tfidf_stn_pos,counts_stn_pos,term_stn_neu,tfidf_stn_neu,counts_stn_neu,term_stn_neg,tfidf_stn_neg,counts_stn_neg
0,union station,0.091705,364,union station,0.081146,1225,union station,0.073396,215
1,kitchener line,0.035359,151,kitchener line,0.029525,418,kitchener line,0.046717,118
2,leslie woo,0.024703,106,eglinton crosstown,0.021904,364,presto card,0.026647,68
3,eglinton crosstown,0.020945,78,presto card,0.021103,310,rush hour,0.018804,52
4,presto card,0.01927,75,barrie line,0.013891,197,barrie line,0.018719,47
5,barrie line,0.018375,67,lakeshore west,0.01381,207,lakeshore east,0.018511,47
6,mount pleasant,0.016986,74,milton line,0.013211,189,milton line,0.018122,46
7,good morning,0.013414,51,lakeshore east,0.012546,186,en raison,0.012836,83
8,relief line,0.013008,54,rush hour,0.011774,180,la gare,0.01024,92
9,rush hour,0.011243,46,finch west,0.011329,159,de minutes,0.009376,74


In [306]:
tfidf_stn_sentiment.to_csv('twitter_TFIDF_station_sentiment_bitrigrams.csv')