In [None]:
import numpy as np
import pandas as pd
from datetime import datetime as dt
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from scipy.stats import chi2

In [1]:
path = "/home/tanglinh/Desktop/CSC299/"

#------------------
# INPUT FILES
#------------------
# All inputs are tab-separated text files located under `path`.
tweets_file = path + "Musical Tweet Data/tweet.txt"
tracks_file = path + "Musical Tweet Data/track.txt"
artists_file = path + "Musical Tweet Data/artists.txt"
mbartists_file = path + "mbdump/artist"

#------------------
# READ + JOIN
#------------------
# tweet.txt is read without a header row, so the column labels are assigned here.
tweets = pd.read_csv(tweets_file, sep='\t', header=None)
tweets.columns = ['file_id', 'tweet_id', 'user_id', 'artistId', 'trackId',
                  'datetime', 'weekday', 'longitude', 'latitude']

# track.txt uses its first line as the header; the row labelled 0 is then discarded.
tracks = pd.read_csv(tracks_file, sep='\t', header=0)
tracks = tracks.drop(0)

# The artist dump is read without a header row; label all 19 columns first,
# then keep only the key and the three attributes used later in the notebook.
mbartists = pd.read_csv(mbartists_file, sep='\t', header=None, low_memory=False)
mbartists.columns = ['id', 'mbid', 'artists_name', 'sort_name',
                     'begin_date_year', 'begin_date_month', 'begin_date_day',
                     'end_date_year', 'end_date_month', 'end_date_day',
                     'artist_type', 'artist_area', 'artist_gender',
                     'comment', 'edits_pending', 'last_updated', 'ended',
                     'artist_born', 'artist_died']
mbartists = mbartists[['mbid', 'artist_type', 'artist_area', 'artist_gender']]

# Attach the artist attributes via the mbid key, then drop both key columns.
artists = pd.read_csv(artists_file, sep='\t', header=0)
artists = artists.merge(mbartists, left_on='artist_mbid', right_on='mbid')
artists = artists.drop(columns=['artist_mbid', 'mbid'])

# Build the metadata frame: tweets joined to their track, then to the artist.
# Both merges use pandas' default join type.
data = tweets.merge(tracks, left_on=['trackId', 'artistId'],
                    right_on=['track_id', 'track_artistId'])
data = data.merge(artists, left_on='artistId', right_on='artist_id')

#extract month, year and calendar-date info from the datetime strings
data['datetime'] = pd.to_datetime(data['datetime'])
data['tweet_month'] = data['datetime'].dt.month
data['tweet_year'] = data['datetime'].dt.year
data['tweet_date'] = data['datetime'].dt.date

#cluster hours in the day: 5-12:Morning, 13-17: Afternoon, 18-21: Evening, 22-4: Night
#the clock is rotated so that hour 5 maps to 0 and hour 4 maps to 23, which keeps the
#Night period (22-4) contiguous. Bins are left-closed (right=False), so each edge is the
#first shifted hour of a period: 0 (=5h), 8 (=13h), 13 (=18h), 17 (=22h); 24 closes Night.
#The Afternoon/Evening edge was previously 12 (=17h), which put hour 17 into Evening
#and contradicted the ranges stated above.
hours = data['datetime'].dt.hour
data['tweet_time'] = pd.cut((hours - 5) % 24, bins=[0, 8, 13, 17, 24],
                            labels=['Morning','Afternoon','Evening','Night'], right=False)

#cluster days in the week: Mon-Fri:weekday, Sat + Sun = Weekend
#NOTE(review): assumes the 'weekday' column is coded 0=Monday ... 6=Sunday - confirm against the dataset
data['tweet_day'] = pd.cut(data['weekday'], bins=[0, 5, 7], labels=['weekday', 'weekend'], right=False)

#drop the columns that are no longer needed, then remove exact duplicate rows
data = data.drop(columns=['longitude', 'latitude', 'weekday', 'trackId',
                          'track_artistId', 'artistId', 'file_id', 'datetime'])
data = data.drop_duplicates()
#persist the cleaned metadata frame (row index is not written)
data.to_csv(path + "mydata.csv", index=False)

In [None]:
#---------------------------
# DATA VISUALIZATION
#---------------------------

    #create dataframes of 2 columns: a grouping variable and the number of tweets posted for each of its values in the US
    #context
day_df = data.groupby('tweet_day')['tweet_id'].count().rename('tweet_count').reset_index()
month_df = data.groupby('tweet_month')['tweet_id'].count().rename('tweet_count').reset_index()
time_df = data.groupby('tweet_time')['tweet_id'].count().rename('tweet_count').reset_index()
date_df = data.groupby('tweet_date')['tweet_id'].count().rename('tweet_count').reset_index()
    #content: tweet counts per track / user / artist, largest count first
song_df = (
    data.groupby('track_id')['tweet_id']
    .count()
    .rename('tweet_count')
    .reset_index()
    .sort_values(by='tweet_count', ascending=False)
)
user_df = (
    data.groupby('user_id')['tweet_id']
    .count()
    .rename('tweet_count')
    .reset_index()
    .sort_values(by='tweet_count', ascending=False)
)
artist_df = (
    data.groupby('artist_name')['tweet_id']
    .count()
    .rename('tweet_count')
    .reset_index()
    .sort_values(by='tweet_count', ascending=False)
)
    
    #visualization between number of tweets and various context types
def visual(df):
    """Show a bar chart of a two-column summary frame.

    The first column of ``df`` provides the x-axis tick labels and the second
    column provides the bar heights. The two column names are used as the
    axis labels. The figure is displayed with ``plt.show()``; nothing is returned.
    """
    column_names = df.columns.tolist()
    label_col = column_names[0]
    value_col = column_names[1]
    # One bar per row, placed at integer positions 0..n-1.
    positions = np.arange(len(df[label_col]))
    plt.xlabel(label_col)
    plt.ylabel(value_col)
    plt.bar(positions, df[value_col])
    # Replace the integer positions with the actual category labels.
    plt.xticks(positions, df[label_col])
    plt.show()

    #filter out an attribute with specific value, eg. female artists        
def myfilter(attribute, value, df=None):
    """Return the rows whose ``attribute`` column equals ``value``.

    Parameters
    ----------
    attribute : column label
        Column to compare.
    value : scalar
        Value the column is compared against with ``==``.
    df : pandas.DataFrame, optional
        Frame to filter. When omitted, the notebook-level ``data`` frame is
        used, which is the original behaviour of this function.

    Returns
    -------
    pandas.DataFrame
        The subset of rows where ``df[attribute] == value``.
    """
    # Fall back to the global frame only when no frame is supplied, so the
    # helper can also be applied to intermediate or already-filtered frames.
    source = data if df is None else df
    return source[source[attribute] == value]

In [None]:
#---------------------------
# STATISTICAL TEST
#---------------------------
def chi2test(df, prob=0.99):
    """Run Pearson's chi-square test of independence and print the verdict.

    Used to find context attributes that are statistically significant.

    Parameters
    ----------
    df : pandas.DataFrame
        Contingency table of observed frequencies, passed unchanged to
        ``scipy.stats.chi2_contingency``.
    prob : float, optional
        Confidence level in the open interval (0, 1). The critical value is
        ``chi2.ppf(prob, dof)`` and the significance level is ``1 - prob``.
        Defaults to 0.99, the level this function previously hard-coded.

    Returns
    -------
    None
        The first rows of the table, the degrees of freedom and the outcome
        of both decision rules are printed.

    Raises
    ------
    ValueError
        If ``prob`` is not strictly between 0 and 1.
    """
    if not 0.0 < prob < 1.0:
        raise ValueError('prob must be strictly between 0 and 1, got %r' % (prob,))

    print(df.head())
    print()
    #run Pearson's Chi-Square Test (the table of expected frequencies is not needed here)
    stat, p, dof, _ = chi2_contingency(df)
    print('degree of freedom = %d' % dof)
    # interpret test-statistic: compare it with the critical value at `prob`
    critical = chi2.ppf(prob, dof)
    print('probability = %.3f, critical value = %.3f, statistics = %.3f' % (prob, critical, stat))
    if stat >= critical:
        print('=> Two variables are dependent')
    else:
        print('=> Two variables are independent')
    # interpret p-value: compare it with the significance level
    alpha = 1.0 - prob
    print('significance level = %.3f, p-value = %.3f' % (alpha, p))
    if p <= alpha:
        print('=> Two variables are dependent')
    else:
        print('=> Two variables are independent')

#-------------------------------
# CONTEXT MATRICES FOR THE TEST
#-------------------------------
    #time of the day
timebased_df = (
    data.groupby(['user_id', 'tweet_time'])['tweet_id']
    .count()
    .unstack(fill_value=0)
)
    #day of the week: counts are scaled by the number of days in each group (5 and 2)
    #NOTE(review): after this scaling the table no longer holds integer counts - confirm
    #this is intended before passing it to a chi-square test
daybased_df = (
    data.groupby(['user_id', 'tweet_day'])['tweet_id']
    .count()
    .unstack(fill_value=0)
)
daybased_df['weekday'] = daybased_df['weekday'].div(5)
daybased_df['weekend'] = daybased_df['weekend'].div(2)
    #calendar date (one column per distinct tweet_date)
datebased_df = (
    data.groupby(['user_id', 'tweet_date'])['tweet_id']
    .count()
    .unstack(fill_value=0)
)
    #month of the year
monthbased_df = (
    data.groupby(['user_id', 'tweet_month'])['tweet_id']
    .count()
    .unstack(fill_value=0)
)

In [None]:
#---------------------------
# ARTIST TYPE BIASNESS
#---------------------------
# Per-user tweet counts, one column per artist_type value.
artist_type = data.pivot_table(index='user_id', columns='artist_type', values='tweet_id',
                               aggfunc='count', fill_value=0)
# Row total over every column present at this point (this includes the '\N' column).
artist_type['Total'] = artist_type.sum(axis=1)
# Convert each type's count into that user's share of the row total.
for type_code in ('1', '2', '3', '4', '5', '6'):
    artist_type[type_code] = artist_type[type_code] / artist_type['Total']
# NOTE(review): '\N' is presumably the marker for a missing type - confirm. Its column is
# zeroed only after the total was computed, so its tweets still count in the denominator.
artist_type['\\N'] = 0
artist_type['Total'] = 1
artist_type = artist_type.round(2)
artist_type.to_csv(path + "artist_type.csv", index=True)

#---------------------------
# ARTIST GENDER BIASNESS
#---------------------------
# Per-user tweet counts, one column per artist_gender value.
gender = data.pivot_table(index='user_id', columns='artist_gender', values='tweet_id',
                          aggfunc='count', fill_value=0)
# Row total over every column present at this point (this includes the '\N' column).
gender['Total'] = gender.sum(axis=1)
# Convert each gender code's count into that user's share of the row total.
for gender_code in ('1', '2', '3', '4'):
    gender[gender_code] = gender[gender_code] / gender['Total']
# NOTE(review): '\N' is presumably the marker for a missing gender - confirm. Its column is
# zeroed only after the total was computed, so its tweets still count in the denominator.
gender['\\N'] = 0
gender['Total'] = 1
gender = gender.round(2)
gender.to_csv(path + "gender_bias.csv", index=True)

#---------------------------
# USER TWEETTIME BIASNESS
#---------------------------
# Per-user tweet counts, one column per time-of-day label.
ttime = data.pivot_table(index='user_id', columns='tweet_time', values='tweet_id',
                         aggfunc='count', fill_value=0)
ttime['Total'] = ttime.sum(axis=1)
# Convert each period's count into that user's share of the row total.
for period in ('Afternoon', 'Evening', 'Morning', 'Night'):
    ttime[period] = ttime[period] / ttime['Total']
# The total is overwritten with the constant 1 once the shares are computed.
ttime['Total'] = 1
ttime = ttime.round(2)
ttime.to_csv(path + "ttime.csv", index=True)