# Logistic Regression

In [93]:
import scipy.io as sio
from sklearn.preprocessing import normalize
from sklearn import linear_model as lm
import pandas as pd
import csv
import numpy as np

In [107]:
data_path = 'hashtag_vec.mat'

data = sio.loadmat(data_path)
org_account_type = data['training_labels'].T
org_account_data = data['training_data'] 
indiv_account_data = data['individual_data'] 
indiv_account_order = data['individual_account_order'] 

In [108]:
data.keys()

dict_keys(['training_labels', 'individual_account_order', 'individual_data', '__header__', 'training_data', '__version__', '__globals__'])

In [109]:
org_account_type.shape

(80, 1)

In [110]:
indiv_account_data.shape

(200, 186)

----
# Build Classifier

In [111]:
# Normalize the data set
normed_org_acnt_data = normalize(org_account_data, axis=1, norm='l2')
normed_indiv_data = normalize(indiv_account_data, axis=1, norm='l2')

In [112]:
# Fit the model to logistic function
logistic = lm.LogisticRegression(fit_intercept=True)
logistic.fit(normed_org_acnt_data, org_account_type)
print('LogisticRegression score: %f'
      % logistic.fit(normed_org_acnt_data, org_account_type).score(normed_org_acnt_data, org_account_type))

LogisticRegression score: 0.975000


  y = column_or_1d(y, warn=True)


In [113]:
logistic.coef_

array([[  2.83093864e+00,  -8.30646097e-01,   1.47366702e-01,
          6.37359801e-01,   1.34900328e-01,   8.86651218e-02,
          5.06173717e-01,   2.03839375e-01,   5.07548440e-02,
          8.43592930e-02,   1.55303775e-01,   5.57115859e-02,
          2.22764438e-01,  -3.47507318e-01,   1.59549099e-01,
          4.34055592e-02,   1.48367710e-01,   3.13412575e-01,
          6.28278626e-01,   1.65426062e-01,   2.57021465e-01,
          5.72995070e-02,   1.36595126e-01,   9.20808360e-02,
          1.18796276e-01,   8.52893933e-02,   8.46629799e-02,
          6.14746573e-02,   1.53922511e-01,   7.32784550e-02,
          6.75804281e-02,   1.38842716e-01,   2.28734824e-02,
          7.54887092e-02,   2.19003071e-02,  -6.20270221e-03,
          6.84501637e-02,   3.64070791e-02,   5.85730156e-02,
          2.65225437e-02,   2.20096682e-02,   2.17148872e-02,
         -1.51291772e-01,   4.05091940e-01,   6.83707566e-02,
         -2.68154456e-02,   4.83243822e-02,   4.58589701e-02,
        

In [114]:
np.argmax(logistic.coef_)

0

Once we fit the logistic model to the hand labeled popular accounts, we can use the logistic model to predict the type of the individual accounts and the strength of the belief. $logistic.predict$ produces the probability of the sample for each class in the model, where classes are ordered as they are in logistic.classes_ = array([0, 1]), 1 is prlife, 0 is prochoice

In [115]:
# Calculate the strength of the belief for individual accounts
strengths = logistic.predict_proba(normed_indiv_data)

In [116]:
pred = logistic.predict(normed_indiv_data)

In [117]:
file_dict = {}
file_dict['individual_account_order'] = indiv_account_order
file_dict['prochoice_strength'] = strengths[:,0]
file_dict['prolife_strength'] = strengths[:,1]
file_dict['predict_class'] = pred
sio.savemat('indiv_logistic_prop_hashtag.mat', file_dict, do_compression=True)

In [92]:
logistic.classes_

array([0, 1])

# Combine result from following diversity

In [31]:
classifier_df = pd.DataFrame(
    {
    "account_name": [s.strip() for s in indiv_account_order], 
    "prochoice_strength": strengths[:,0],
    "prolife_strength": strengths[:,1],
    "predict_class": pred
    }
)

classifier_df.head()

Unnamed: 0,account_name,predict_class,prochoice_strength,prolife_strength
0,ABabysRights,1,0.112896,0.887104
1,ABGYNservices,0,0.541203,0.458797
2,Abigailschumach,1,0.305871,0.694129
3,Alboteaparty,0,0.561752,0.438248
4,alexdelong_,1,0.141862,0.858138


There are 197 accounts in the result_df.

In [32]:
classifier_df.shape

(200, 4)

Add a column tells whether the user is a follower of the prolife account (1) or prochoice (-1) account or both (0).

In [33]:
prolife_followers = []
with open('./prolife_followers.csv', 'r') as f:
    reader = csv.reader(f)
    for item in list(reader):
        prolife_followers += item
assert len(set(prolife_followers))==100

prochoice_followers = []
with open('./prochoice_followers.csv', 'r') as f:
    reader = csv.reader(f)
    for item in list(reader):
        prochoice_followers += item
assert len(set(prochoice_followers))==100

classifier_df['follower of'] = [0 if user in prolife_followers and user in prochoice_followers 
                            else 1 if user in prolife_followers else -1 for user in classifier_df['account_name']]

Combine the classifier's result with the following diversity result.

In [34]:
following_proportion = pd.read_csv('proportion.csv')
combine_df = pd.merge(classifier_df, following_proportion, left_on='account_name', right_on='User', how='left')
result_df = combine_df.drop_duplicates()
result_df = result_df.drop('User', axis=1)
print("size of the data frame", result_df.shape)
result_df.tail()

size of the data frame (200, 7)


Unnamed: 0,account_name,predict_class,prochoice_strength,prolife_strength,follower of,prolife proportion,prochoice proportion
195,VLabarum,1,0.440136,0.559864,1,1.0,0.0
196,wafulkerson1,0,0.609548,0.390452,1,1.0,0.0
197,WarrenDavidson,1,0.2705,0.7295,1,0.9,0.1
198,WilsonRx,1,0.351691,0.648309,1,1.0,0.0
199,wordspurify27,0,0.578185,0.421815,-1,0.0,1.0


# Save the file

In [35]:
result_df.to_csv("hashtag_indiv_strength_diversity.csv", index=False)