# Logistic Regression

In [181]:
import scipy.io as sio
from sklearn.preprocessing import normalize
from sklearn import linear_model as lm
import pandas as pd
import csv
import numpy as np

In [182]:
# Vocabulary lookups used later to map coefficient indices back to tokens.
# Each CSV is parsed with header=None, so its first line comes back as data
# row 0 (presumably a header line -- TODO confirm); the [1:, 0] slice drops
# that row and keeps only the first column.
word_vec = pd.read_csv('select_word.csv', header=None, sep=',').values[1:, 0]
hashtag_vec = pd.read_csv('select_hashtag.csv', header=None, sep=',').values[1:, 0]

----
# Build Classifier

In [183]:
#data_path = 'hashtag_vec.mat'

def compute_logistic(data_path):
    """Fit a logistic-regression classifier on the labelled accounts and score all accounts.

    The .mat file at ``data_path`` is loaded, every row of the training and
    individual feature matrices is L2-normalised, and a logistic regression
    (with intercept) is fitted on the training rows. The training-set score
    is printed. Class probabilities and predicted classes for both the
    training ("org") accounts and the individual accounts are written to a
    .mat file and, via ``combine_diversity_logistic``, to two CSV files.

    Parameters
    ----------
    data_path : str
        Path of a .mat file providing the keys ``training_labels``,
        ``training_data``, ``known_account_order``, ``individual_data`` and
        ``individual_account_order``. If the path contains ``'hashtag'`` the
        output files are tagged ``hashtag``; otherwise they are tagged
        ``word``.

    Returns
    -------
    The fitted model's ``coef_`` array.
    """
    data = sio.loadmat(data_path)
    print("data keys", data.keys())
    # loadmat returns 2-D arrays; flatten the labels to 1-D so that fit()
    # receives the shape it expects instead of emitting a
    # DataConversionWarning for a column vector.
    org_account_type = data['training_labels'].ravel()
    org_account_data = data['training_data']
    org_account_order = data['known_account_order']
    indiv_account_data = data['individual_data']
    indiv_account_order = data['individual_account_order']

    # Normalize the data set (each sample/row scaled to unit L2 norm)
    normed_org_acnt_data = normalize(org_account_data, axis=1, norm='l2')
    normed_indiv_data = normalize(indiv_account_data, axis=1, norm='l2')

    # Fit the model once and reuse the fitted estimator for scoring.
    logistic = lm.LogisticRegression(fit_intercept=True)
    logistic.fit(normed_org_acnt_data, org_account_type)
    print('LogisticRegression score: %f'
          % logistic.score(normed_org_acnt_data, org_account_type))

    # Calculate the strength of the belief for accounts.
    # Column k of predict_proba corresponds to logistic.classes_[k]; the
    # names below assume label 0 is prochoice and label 1 is prolife, as the
    # notebook text states -- TODO confirm against how training_labels was built.
    org_strengths = logistic.predict_proba(normed_org_acnt_data)
    indiv_strengths = logistic.predict_proba(normed_indiv_data)

    indiv_pred = logistic.predict(normed_indiv_data)
    org_pred = logistic.predict(normed_org_acnt_data)

    file_dict = {
        'individual_account_order': indiv_account_order,
        'org_account_order': org_account_order,
        'indiv_prochoice_strength': indiv_strengths[:, 0],
        'indiv_prolife_strength': indiv_strengths[:, 1],
        'org_prochoice_strength': org_strengths[:, 0],
        'org_prolife_strength': org_strengths[:, 1],
        'indiv_class': indiv_pred,
        'org_class': org_pred,
    }

    # The feature family is inferred from the input path and only affects
    # the names of the output files.
    feature_tag = 'hashtag' if 'hashtag' in data_path else 'word'
    mat_path = 'logistic_prop_{}.mat'.format(feature_tag)
    indiv_combine_path = "indiv_strength_diversity_{}.csv".format(feature_tag)
    org_combine_path = "org_strength_diversity_{}.csv".format(feature_tag)

    sio.savemat(mat_path, file_dict, do_compression=True)
    combine_diversity_logistic(indiv_account_order, indiv_strengths[:, 0], indiv_strengths[:, 1], indiv_pred,
                               indiv_combine_path)
    combine_diversity_logistic(org_account_order, org_strengths[:, 0], org_strengths[:, 1], org_pred,
                               org_combine_path)
    return logistic.coef_

def combine_diversity_logistic(account_order, prochoice_strength, prolife_strength, pred_class, save_path):
    """Join classifier output with follow-proportion data and write it to CSV.

    Parameters
    ----------
    account_order : iterable of str
        Account names; each is stripped of surrounding whitespace.
    prochoice_strength, prolife_strength : array-like
        Per-account class probabilities, aligned with ``account_order``.
    pred_class : array-like
        Per-account predicted class, aligned with ``account_order``.
    save_path : str
        Destination CSV. If it contains ``"indiv"`` the accounts are treated
        as individual accounts (follower files and ``proportion.csv`` are
        read from the working directory); otherwise they are treated as the
        labelled accounts (``prolife_proportion.csv`` and
        ``prochoice_proportion.csv`` are read instead).

    Raises
    ------
    AssertionError
        In the "indiv" case, if either follower file does not contain exactly
        100 distinct names.
    """
    classifier_df = pd.DataFrame({
        "account_name": [s.strip() for s in account_order],
        "prochoice_strength": prochoice_strength,
        "prolife_strength": prolife_strength,
        "predict_class": pred_class,
    })
    if "indiv" in save_path:
        prolife_followers = _read_follower_names('./prolife_followers.csv')
        assert len(prolife_followers) == 100

        prochoice_followers = _read_follower_names('./prochoice_followers.csv')
        assert len(prochoice_followers) == 100

        def follower_code(user):
            # 0: appears in both follower lists, 1: prolife list only,
            # -1: everything else (including names found in neither list).
            if user in prolife_followers:
                return 0 if user in prochoice_followers else 1
            return -1

        classifier_df['follower of'] = [follower_code(user) for user in classifier_df['account_name']]
        following_proportion = pd.read_csv('proportion.csv')
        combine_df = pd.merge(classifier_df, following_proportion, left_on='account_name', right_on='User', how='left')
        result_df = combine_df.drop_duplicates()
        result_df = result_df.drop('User', axis=1)
    else:
        prolife_proportion = pd.read_csv('prolife_proportion.csv')
        prochoice_proportion = pd.read_csv('prochoice_proportion.csv')
        result_df = pd.concat([prolife_proportion, prochoice_proportion])
        result_df = pd.merge(classifier_df, result_df, left_on='account_name', right_on='User', how='left')
        result_df = result_df.drop('User', axis=1)
        # NOTE(review): hard-codes 80 rows, the first 40 typed 1 and the last
        # 40 typed 0; pandas raises if the merged frame has a different length.
        result_df['account_type'] = [1]*40+[0]*40
    result_df.to_csv(save_path, index=False)


def _read_follower_names(csv_path):
    """Return the set of all cell values found in the CSV file at ``csv_path``."""
    names = set()
    # newline='' is the file mode the csv module documentation asks for.
    with open(csv_path, 'r', newline='') as f:
        for row in csv.reader(f):
            names.update(row)
    return names
        

In [184]:
# Run the classifier pipeline once per feature family (word features first,
# then hashtag features) and keep each model's coefficient array.
word_logistic_coef, hashtag_logistic_coef = (
    compute_logistic(mat_file) for mat_file in ("word_vec.mat", "hashtag_vec.mat")
)

data keys dict_keys(['__header__', 'individual_data', '__globals__', '__version__', 'training_data', 'known_account_order', 'training_labels', 'individual_account_order'])
LogisticRegression score: 0.987500
data keys dict_keys(['__header__', 'individual_data', '__globals__', '__version__', 'training_data', 'known_account_order', 'training_labels', 'individual_account_order'])
LogisticRegression score: 0.987500


  y = column_or_1d(y, warn=True)


Once we fit the logistic model to the hand-labeled popular accounts, we can use it to predict both the type of each individual account and the strength of its belief. `logistic.predict` returns the predicted class of each sample, while `logistic.predict_proba` returns the probability of the sample belonging to each class in the model. The probability columns are ordered as in `logistic.classes_ = array([0, 1])`, where 1 is prolife and 0 is prochoice.

In [185]:
# Pick out the hashtags whose coefficients are largest in absolute value.
TOP_K = 20  # number of hashtags to show

coef = np.abs(hashtag_logistic_coef.ravel())
# argsort on the negated magnitudes gives indices in descending order.
sig = np.argsort(-coef)
ind = sig[:TOP_K]
# coef[ind] holds the matching magnitudes if they need to be inspected;
# only the last expression of a cell is displayed.
hashtag_vec[ind]

array(['prolife ', 'repealthe8th ', 'prochoice ', 'beboldendhyde ',
       'stopthesham ', 'defundpp ', 'protestpp ', 'reprojustice ',
       'shoutyourabortion ', 'reprorights ', 'ppsellsbabyparts ',
       'periscope ', 'plannedparenthood ', 'icommittoprayread ',
       'givingtuesday ', '9daysforlife ', 'womensmarch ',
       'prolifewomen16 ', 'demsinphilly ', '40daysforlife '], dtype=object)