In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import prince
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Select matplotlib's "ggplot" style sheet for any figures drawn in this notebook.
plt.style.use("ggplot")

## Import the data

In [20]:
# Read the "access" and "usage" extracts from the shared Datasets folder.
IND_a = pd.read_csv("../Datasets/United States_2021_access.csv")
IND_u = pd.read_csv("../Datasets/United States_2021_usage.csv")

# 'fin34a' is the target: keep an independent copy taken from the access frame,
# then remove the column from both feature frames.
y = IND_a["fin34a"].copy()
IND_a, IND_u = (frame.drop(columns="fin34a") for frame in (IND_a, IND_u))

## Implementing MCA

In [4]:
encoder = OneHotEncoder()

# One-hot encode the access variables; the encoder returns a sparse matrix.
one_hot_encoded_IND_access = encoder.fit_transform(IND_a)

# Densify the sparse matrix into a DataFrame, the input handed to prince below.
final_data = pd.DataFrame(one_hot_encoded_IND_access.toarray())

# NOTE(review): prince.MCA appears to one-hot encode its input itself by
# default, so feeding it an already encoded frame may encode the data twice.
# Confirm against the installed prince version (pass the raw categorical
# columns, or disable the internal encoding) before relying on these axes.
#
# Ask for one component per encoded column: the number of meaningful axes is
# bounded by the column count, not by the row count that ``len(final_data)``
# returned. A fixed random_state makes the SVD reproducible across re-runs.
mca_IND_a = prince.MCA(n_components=final_data.shape[1], random_state=42)
mca_IND_a.fit(final_data)

# Row coordinates of every observation on the fitted components.
IND_access_index = mca_IND_a.transform(final_data)
IND_access_index

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.193411,-0.147510,0.099630,0.116983,-0.048239,0.000657,0.982892,-0.148737,-0.018316,0.043583,0.067893,0.070397,-1.252487e-15
1,-0.165270,0.190061,-0.654623,-0.177097,-0.044368,0.000303,0.982892,-0.148737,-0.018316,0.043583,0.067893,0.070397,-1.141464e-15
2,-0.193411,-0.147510,0.099630,0.116983,-0.048239,0.000657,0.982892,-0.148737,-0.018316,0.043583,0.067893,0.070397,-1.252487e-15
3,-0.193411,-0.147510,0.099630,0.116983,-0.048239,0.000657,0.982892,-0.148737,-0.018316,0.043583,0.067893,0.070397,-1.252487e-15
4,0.197672,-0.062080,0.394593,-0.497968,0.410547,-0.033301,0.982892,-0.148737,-0.018316,0.043583,0.067893,0.070397,-1.085953e-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1002,-0.165270,0.190061,-0.654623,-0.177097,-0.044368,0.000303,0.982892,-0.148737,-0.018316,0.043583,0.067893,0.070397,-1.141464e-15
1003,-0.193411,-0.147510,0.099630,0.116983,-0.048239,0.000657,0.982892,-0.148737,-0.018316,0.043583,0.067893,0.070397,-1.252487e-15
1004,-0.193411,-0.147510,0.099630,0.116983,-0.048239,0.000657,0.982892,-0.148737,-0.018316,0.043583,0.067893,0.070397,-1.252487e-15
1005,-0.193411,-0.147510,0.099630,0.116983,-0.048239,0.000657,0.982892,-0.148737,-0.018316,0.043583,0.067893,0.070397,-1.252487e-15


In [5]:
# Eigenvalue table of the fitted access MCA (one row per component, with the
# share of variance and its cumulative sum); displayed as the cell output.
cumulative_variance_IND_a= mca_IND_a.eigenvalues_summary
cumulative_variance_IND_a

Unnamed: 0_level_0,eigenvalue,% of variance,% of variance (cumulative)
component,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.289,38.57%,38.57%
1,0.165,22.02%,60.59%
2,0.123,16.35%,76.94%
3,0.086,11.47%,88.41%
4,0.076,10.11%,98.52%
5,0.011,1.48%,100.00%
6,0.0,0.00%,100.00%
7,0.0,0.00%,100.00%
8,0.0,0.00%,100.00%
9,0.0,0.00%,100.00%


In [6]:
# Turn the formatted "xx.xx%" strings into floats and blank out (NaN) every
# component at which the cumulative share has already reached 100 %.
# Casting to ``str`` first keeps the cell safe to re-run: on a second run the
# column already holds floats, and calling ``.str`` on it directly would raise.
_cum_col = "% of variance (cumulative)"
_cum_pct = (
    cumulative_variance_IND_a[_cum_col]
    .astype(str)
    .str.rstrip("%")
    .astype(float)
)
cumulative_variance_IND_a[_cum_col] = _cum_pct.where(_cum_pct < 100)

In [7]:
# Number of leading components to keep: the rows whose cumulative share is
# still below 100 % (every other row is NaN at this point). Counting non-null
# entries avoids two pitfalls of ``len(value_counts(dropna=False)) - 1``:
# identical percentages collapsing into a single bucket, and the ``- 1``
# discarding a real component when the column contains no NaN at all.
comp_IND_a = int(cumulative_variance_IND_a["% of variance (cumulative)"].notna().sum())

In [8]:
# Keep only the first `comp_IND_a` coordinate columns (positional slice).
# This rebinds the name, so the full set of coordinates is only recoverable by
# re-running the cell that calls `mca_IND_a.transform`.
IND_access_index= IND_access_index.iloc[:, :comp_IND_a]

In [9]:
# One-hot encode the usage variables. This re-fits the shared `encoder`, so
# afterwards it holds the usage categories rather than the access ones.
one_hot_encoded_IND_usage = encoder.fit_transform(IND_u)

# Densify the sparse matrix into a DataFrame, the input handed to prince below.
final_data = pd.DataFrame(one_hot_encoded_IND_usage.toarray())

# NOTE(review): prince.MCA appears to one-hot encode its input itself by
# default, so feeding it an already encoded frame may encode the data twice.
# Confirm against the installed prince version (pass the raw categorical
# columns, or disable the internal encoding) before relying on these axes.
#
# Ask for one component per encoded column: the number of meaningful axes is
# bounded by the column count, not by the row count that ``len(final_data)``
# returned. A fixed random_state makes the SVD reproducible across re-runs.
mca_IND_u = prince.MCA(n_components=final_data.shape[1], random_state=42)
mca_IND_u.fit(final_data)

# Row coordinates of every observation on the fitted components.
IND_usage_index = mca_IND_u.transform(final_data)
IND_usage_index

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,74
0,-0.556990,0.508113,0.045489,-0.043841,0.009143,0.152761,0.032656,0.225863,-0.214879,-0.389259,...,-0.000001,0.000008,-0.000005,0.000003,0.000008,0.000008,6.981667e-16,-8.756789e-17,-8.494386e-17,2.602085e-17
1,-0.196070,-0.077377,-0.046226,-0.008713,0.062356,-0.179325,0.270711,0.131218,-0.071355,0.033592,...,-0.000001,0.000008,-0.000005,0.000003,0.000008,0.000008,6.981667e-16,-8.756789e-17,-8.494386e-17,1.040834e-17
2,-0.079599,-0.198935,-0.046393,-0.033444,0.049648,-0.037998,0.377978,-0.025625,0.135305,-0.051433,...,-0.000001,0.000008,-0.000005,0.000003,0.000008,0.000008,6.981667e-16,-8.756789e-17,-8.494386e-17,0.000000e+00
3,0.073932,-0.409553,-0.057983,0.023477,0.064082,0.019753,-0.229726,0.015679,0.052929,-0.426038,...,-0.000001,0.000008,-0.000005,0.000003,0.000008,0.000008,6.981667e-16,-8.756789e-17,-8.494386e-17,8.673617e-18
4,-0.349298,0.411145,0.030115,-0.054666,0.074061,0.283126,-0.008646,-0.411247,-0.148493,0.092112,...,-0.000001,0.000008,-0.000005,0.000003,0.000008,0.000008,6.981667e-16,-8.756789e-17,-8.494386e-17,1.908196e-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1002,-0.357090,0.443540,0.037014,-0.035001,0.003691,0.180223,-0.115304,0.242065,-0.409778,-0.022440,...,-0.000001,0.000008,-0.000005,0.000003,0.000008,0.000008,6.981667e-16,-8.756789e-17,-8.494386e-17,2.775558e-17
1003,-0.169223,-0.260663,-0.059624,0.003781,0.112802,-0.185195,0.105065,0.044061,-0.059203,0.008438,...,-0.000001,0.000008,-0.000005,0.000003,0.000008,0.000008,6.981667e-16,-8.756789e-17,-8.494386e-17,6.938894e-18
1004,-0.323905,0.497228,0.069539,-0.065785,-0.113941,0.401669,0.326104,-0.286575,0.139263,-0.130398,...,-0.000001,0.000008,-0.000005,0.000003,0.000008,0.000008,6.981667e-16,-8.756789e-17,-8.494386e-17,2.081668e-17
1005,-0.045252,0.076308,0.021560,0.111492,-0.405174,-0.667139,-0.041149,0.026482,0.087566,0.049895,...,-0.000001,0.000008,-0.000005,0.000003,0.000008,0.000008,6.842889e-16,-8.756789e-17,-8.494386e-17,5.551115e-17


In [10]:
# Eigenvalue table of the fitted usage MCA (one row per component, with the
# share of variance and its cumulative sum); displayed as the cell output.
cumulative_variance_IND_u= mca_IND_u.eigenvalues_summary
cumulative_variance_IND_u

Unnamed: 0_level_0,eigenvalue,% of variance,% of variance (cumulative)
component,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.218,15.84%,15.84%
1,0.147,10.72%,26.56%
2,0.136,9.87%,36.43%
3,0.127,9.21%,45.64%
4,0.093,6.78%,52.43%
...,...,...,...
70,0.000,0.00%,100.00%
71,0.000,0.00%,100.00%
72,0.000,0.00%,100.00%
73,0.000,0.00%,100.00%


In [11]:
# Turn the formatted "xx.xx%" strings into floats and blank out (NaN) every
# component at which the cumulative share has already reached 100 %.
# Casting to ``str`` first keeps the cell safe to re-run: on a second run the
# column already holds floats, and calling ``.str`` on it directly would raise.
_cum_col = "% of variance (cumulative)"
_cum_pct = (
    cumulative_variance_IND_u[_cum_col]
    .astype(str)
    .str.rstrip("%")
    .astype(float)
)
cumulative_variance_IND_u[_cum_col] = _cum_pct.where(_cum_pct < 100)

In [12]:
# Number of leading components to keep: the rows whose cumulative share is
# still below 100 % (every other row is NaN at this point). Counting non-null
# entries avoids two pitfalls of ``len(value_counts(dropna=False)) - 1``:
# identical percentages collapsing into a single bucket, and the ``- 1``
# discarding a real component when the column contains no NaN at all.
comp_IND_u = int(cumulative_variance_IND_u["% of variance (cumulative)"].notna().sum())

In [13]:
# Keep only the first `comp_IND_u` coordinate columns (positional slice) and
# display the result. This rebinds the name, so the full set of coordinates is
# only recoverable by re-running the cell that calls `mca_IND_u.transform`.
IND_usage_index= IND_usage_index.iloc[:, :comp_IND_u]
IND_usage_index

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,-0.556990,0.508113,0.045489,-0.043841,0.009143,0.152761,0.032656,0.225863,-0.214879,-0.389259,...,-0.023097,-0.096042,0.050594,0.051027,-0.038551,-0.022441,0.040594,0.004383,0.000815,-0.007865
1,-0.196070,-0.077377,-0.046226,-0.008713,0.062356,-0.179325,0.270711,0.131218,-0.071355,0.033592,...,-0.167668,-0.096331,-0.166954,-0.000210,0.183056,0.015271,-0.002203,-0.009691,0.062745,-0.070963
2,-0.079599,-0.198935,-0.046393,-0.033444,0.049648,-0.037998,0.377978,-0.025625,0.135305,-0.051433,...,-0.002358,0.018398,-0.020082,-0.047122,-0.103628,0.054125,-0.025053,-0.060826,0.058254,0.294431
3,0.073932,-0.409553,-0.057983,0.023477,0.064082,0.019753,-0.229726,0.015679,0.052929,-0.426038,...,-0.029429,0.080798,0.199090,-0.018184,0.042882,0.155560,0.006410,-0.050382,-0.033483,0.004544
4,-0.349298,0.411145,0.030115,-0.054666,0.074061,0.283126,-0.008646,-0.411247,-0.148493,0.092112,...,0.243327,0.131362,-0.149583,0.141691,-0.203097,-0.247266,-0.075833,-0.003175,-0.005802,0.270939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1002,-0.357090,0.443540,0.037014,-0.035001,0.003691,0.180223,-0.115304,0.242065,-0.409778,-0.022440,...,-0.033551,0.087046,-0.249728,0.050829,0.030861,0.051714,-0.022182,0.018698,-0.033926,-0.051430
1003,-0.169223,-0.260663,-0.059624,0.003781,0.112802,-0.185195,0.105065,0.044061,-0.059203,0.008438,...,0.000482,-0.074051,0.083252,-0.088961,-0.092537,0.082633,0.022907,-0.013107,-0.080067,0.016913
1004,-0.323905,0.497228,0.069539,-0.065785,-0.113941,0.401669,0.326104,-0.286575,0.139263,-0.130398,...,-0.110969,0.157041,-0.078756,0.069271,-0.039527,0.097106,0.064033,0.072492,-0.089964,-0.053951
1005,-0.045252,0.076308,0.021560,0.111492,-0.405174,-0.667139,-0.041149,0.026482,0.087566,0.049895,...,0.132621,0.036424,0.036277,0.030733,0.032043,0.088330,0.017784,0.032572,-0.049851,0.002261


In [14]:
# Combine the retained access and usage components into one feature matrix.
# Both frames label their columns 0, 1, 2, ..., so a plain concat would produce
# duplicate column labels; prefixing keeps every feature uniquely addressable.
X = pd.concat(
    [
        IND_access_index.add_prefix("access_"),
        IND_usage_index.add_prefix("usage_"),
    ],
    axis=1,
)

In [15]:
# Hold out a quarter of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Fit a default logistic regression on the training portion
# (``fit`` returns the estimator itself, so `clf` is the fitted model).
clf = LogisticRegression().fit(X_train, y_train)

# Score the held-out rows and report plain accuracy.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9126984126984127
