## Cluster tryout

# Step 1: load packages

In [32]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance
def compute_bic(kmeans,X):
    """
    Computes the BIC metric for a given clusters

    Parameters:
    -----------------------------------------
    kmeans:  List of clustering object from scikit learn

    X     :  multidimension np array of data points

    Returns:
    -----------------------------------------
    BIC value
    """
    # assign centers and labels
    centers = [kmeans.cluster_centers_]
    labels  = kmeans.labels_
    #number of clusters
    m = kmeans.n_clusters
    # size of the clusters
    n = np.bincount(labels)
    #size of data set
    N, d = X.shape

    #compute variance for all clusters beforehand
    cl_var = (1.0 / (N - m) / d) * sum([sum(distance.cdist(X[np.where(labels == i)], [centers[0][i]], 'euclidean')**2) for i in range(m)])
    const_term = 0.5 * m * np.log(N) * (d+1)

    BIC = np.sum([n[i] * np.log(n[i]) -
               n[i] * np.log(N) -
             ((n[i] * d) / 2) * np.log(2*np.pi*cl_var) -
             ((n[i] - 1) * d/ 2) for i in range(m)]) - const_term
    return(BIC)


# Step 2: Importing the data

In [33]:
df = pd.read_stata('../input/Clusterallcountry.dta', convert_categoricals=False)

# Step 3: Create table with Panda

In [34]:
display(df)

Unnamed: 0,mergeid,country,sp002_,sp003_1,sp003_2,sp003_3,sp004d1_1,sp004d1_2,sp004d1_3,sp004d2_1,sp004d2_2,sp004d2_3,sp004d3_1,sp004d3_2,sp004d3_3,sp005_1,sp005_2,sp005_3
0,AT-004855-02,11,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,AT-010904-02,11,1,49,0,0,0,0,0,1,0,0,0,0,0,3,0,0
2,AT-011464-01,11,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,AT-014640-01,11,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,AT-015615-01,11,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11333,SE-995556-01,13,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11334,SE-998426-01,13,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11335,SE-998426-02,13,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11336,SE-998807-01,13,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Step 4: drop variables to exclude from clustering

In [35]:
df.drop('country', axis=1, inplace = True)

In [36]:
display(df)

Unnamed: 0,mergeid,sp002_,sp003_1,sp003_2,sp003_3,sp004d1_1,sp004d1_2,sp004d1_3,sp004d2_1,sp004d2_2,sp004d2_3,sp004d3_1,sp004d3_2,sp004d3_3,sp005_1,sp005_2,sp005_3
0,AT-004855-02,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,AT-010904-02,1,49,0,0,0,0,0,1,0,0,0,0,0,3,0,0
2,AT-011464-01,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,AT-014640-01,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,AT-015615-01,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11333,SE-995556-01,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11334,SE-998426-01,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11335,SE-998426-02,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11336,SE-998807-01,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Step 5: set X(which variables) and C(if categorical or not)

In [48]:
X=df.to_numpy()

TypeError: bad operand type for unary -: 'str'

basic_feature_names = ['mergeid']
basic_feature_names.remove('mergeid')

In [39]:
C = [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]

initiallize empty kmeans model
then add data to model

a=np.array([[2, 0], [3, 0], [4,0], [5,0],[6,0],[7,0],[8,0],[9,0],[10,0]])
clusters=[2,3,4,5,6,7,8,9,10]
init=['k-means++','random']
parameters={'n_clusters': clusters,'init':init}
kmeans = KMeans(random_state=0, algorithm="auto")
clf = GridSearchCV(kmeans, parameters)
clf.fit(X)


bic=np.zeros(9,2)
for cluster in range(0,9): 
    for i in range(0,2):
        
        

print(a)

In [40]:
df.sp002_.value_counts()

5    8330
1    3008
Name: sp002_, dtype: int64

# Step 6: cluster analysis; bic; add cluster values to dataframe

In [45]:
a=np.array([[2, 0], [3, 0], [4,0], [5,0],[6,0],[7,0],[8,0],[9,0],[10,0]])

for i in range(0,9):

    kmeans = KMeans(n_clusters=a[i][0], random_state=0, init='k-means++',max_iter=10000, algorithm="full")
    kmeans.fit(X)
    clusters=kmeans.predict(X)
    a[i][1]=compute_bic(kmeans,X)
    print(dict(Counter(clusters)))
    i = str(i)
    df['clusters_'+i]=clusters


display(df)
plt.plot(a[:,0],a[:,1])
plt.savefig('bic.png')

TypeError: unsupported operand type(s) for -: 'str' and 'str'

# Step X: first try to figure out how the clusters are generated

In [None]:
df['clusters']=clusters

newdf=df[(df.sp002_==1)&(df.clusters==2)]
display(newdf)
print(newdf.sp003_1.value_counts())
print(newdf.sp005_1.value_counts())

In [None]:
print(df.clusters_0.value_counts())
print(df.clusters_1.value_counts())
print(df.clusters_2.value_counts())
print(df.clusters_3.value_counts())
print(df.clusters_4.value_counts())
print(df.clusters_5.value_counts())
print(df.clusters_6.value_counts())
print(df.clusters_7.value_counts())
print(df.clusters_8.value_counts())

# Step Z: Set dataframe to statafile

In [None]:
df.to_stata('clusterdone.dta')  