# KMeans-Cluster Census DataSet

### Investigation: I am going to investigate the link between age and the number of hours people work per week. Then I will investigate how the results differ depending on each person's sex.

#### Importing Libraries

In [None]:
# Import relevant libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt 
%matplotlib inline
from sklearn.cluster import KMeans

#### Creating Functions

In [None]:
def OpenFiles(worksheet_name):
    """Load one worksheet of ``Data/data_output.xlsx`` into a DataFrame.

    The workbook is located relative to the notebook-level ``pwd`` variable,
    which must be defined before this function is called.

    Parameters
    ----------
    worksheet_name : str
        Name of the worksheet to read; forwarded to ``pd.read_excel`` as
        ``sheet_name``.

    Returns
    -------
    pandas.DataFrame
        The contents of the requested worksheet.
    """
    # Build the path with os.path.join so the platform's separator is used.
    # The previous '\Data\data_output.xlsx' literal only resolved on Windows
    # and depended on '\D' / '\d' being passed through as unrecognised escapes.
    workbook_path = os.path.join(pwd, 'Data', 'data_output.xlsx')
    # Read the workbook but import only the particular worksheet requested.
    df = pd.read_excel(workbook_path, sheet_name=worksheet_name)
    return df

def DelUnCol(df):
    """Return a new frame without the 'Unnamed: 0' and 'sex' columns.

    The input frame is not modified. A ``KeyError`` is raised by pandas if
    either column is absent.
    """
    unwanted = ['Unnamed: 0', 'sex']
    trimmed = df.drop(unwanted, axis=1)
    return trimmed

def ElbowMethod(dis, K):
    """Draw the elbow plot of distortion against the candidate cluster counts.

    Parameters
    ----------
    dis : sequence of numbers
        Distortion values, one per entry of ``K``; plotted on the y-axis.
    K : sequence
        Candidate cluster counts; plotted on the x-axis.
    """
    _, ax = plt.subplots(figsize=(16, 8))
    ax.plot(K, dis, 'bx-')  # blue solid line with 'x' markers
    ax.set_xlabel('k')
    ax.set_ylabel('Distortion')
    ax.set_title('The Elbow Method Showing The Optimal K Value')
    plt.show()
    
def KModel(df, K, random_state=None):
    """Fit a KMeans model with ``K`` clusters to ``df`` and return it.

    Parameters
    ----------
    df : array-like
        Data passed unchanged to ``KMeans.fit``.
    K : int
        Number of clusters (``n_clusters``).
    random_state : int or None, optional
        Forwarded to ``KMeans``. The default ``None`` keeps the previous
        unseeded behaviour; pass an integer for reproducible results.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted estimator. Previously the model was fitted and then
        discarded, so callers had no way to inspect the result.
    """
    KMeansMod = KMeans(n_clusters=K, random_state=random_state)
    KMeansMod.fit(df)
    return KMeansMod
    
def KMeans_Scatter(df, n, random_state=None):
    """Cluster 'age' against 'hours-per-week' with KMeans and plot the result.

    Each cluster is drawn with its own colour/marker pair and the cluster
    centres are overlaid as large yellow dots.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'age' and 'hours-per-week'; only these two
        columns are used for clustering.
    n : int
        Number of clusters. Colour/marker pairs repeat when ``n`` exceeds the
        nine predefined styles.
    random_state : int or None, optional
        Forwarded to ``KMeans``. The default ``None`` keeps the previous
        unseeded behaviour; pass an integer for reproducible results.

    Returns
    -------
    numpy.ndarray
        Cluster label of every row of ``df``, in row order. (When this call is
        the last expression of a notebook cell the array is displayed; end the
        line with ``;`` to suppress that.)
    """
    # Two-column feature matrix: column 0 is age, column 1 is hours-per-week.
    X = df[['age', 'hours-per-week']].to_numpy()

    colors = ['b', 'g', 'r', 'c', 'm', 'k', 'purple', 'pink', 'orange']
    markers = ['o', 'v', 'D', 's', 'x', '*', 'p', 'd', '+']

    plt.figure(figsize=(20, 15), dpi=100)
    plt.ylabel('hours-per-week')
    plt.xlabel('age')

    kmeans = KMeans(n_clusters=n, random_state=random_state).fit(X)
    labels = kmeans.labels_

    # One plot call per cluster instead of one per data point: the markers look
    # the same, but the figure holds n artists rather than one per row.
    for cluster in range(n):
        members = X[labels == cluster]
        style = cluster % len(colors)  # wrap around instead of raising IndexError for n > 9
        plt.plot(members[:, 0], members[:, 1], linestyle='None',
                 color=colors[style], marker=markers[style])

    # Draw the centres above the data points; with the default z-order the
    # scatter collection sits underneath the line markers and is hidden.
    centers = kmeans.cluster_centers_
    plt.scatter(centers[:, 0], centers[:, 1], s = 250, c = 'yellow', label = 'Clusters', zorder=3)
    plt.legend()  # the label given to the centres is only visible through a legend
    plt.show()
    return labels

### Reading Files and Declaring needed Variables.

In [None]:
# Capture the present working directory; OpenFiles builds the workbook path from it.
pwd = os.getcwd()

# Open the three worksheets in order: the combined results, then the sheets
# named for the male and the female data.
data, male, female = (
    OpenFiles(sheet)
    for sheet in ('Final_Result', 'Final_MaleData', 'Final_FemaleData')
)

# Preview the first rows of each frame.
(data.head(), male.head(), female.head())

In [None]:
"""The xlsx writer library seems to automatically put an autoincrement column in which in this case
is not necessary, so I can remove it here. Also, for the purposes of what I am investigating, I do not actually need the 'sex' column.
To recognise which sex each data set represents, I have named them appropriately. So I can use the DelUnCol method to delete these unneeded columns."""

# Strip the unneeded columns from all three frames.
# NOTE: the names are rebound to the trimmed frames, so running this cell a
# second time without reloading the worksheets raises a KeyError (the columns
# DelUnCol drops are already gone).
data, male, female = map(DelUnCol, (data, male, female))

# Preview the first rows of each trimmed frame.
(data.head(), male.head(), female.head())

Final_Result worksheet dataset, which includes the data for both sexes combined.

In [None]:
# Fit KMeans for k = 1..9 on the combined data and record each model's inertia.
# NOTE(review): the model is fitted on every column left in `data`, whereas
# KMeans_Scatter clusters on 'age' and 'hours-per-week' only - confirm the
# frame holds just those two columns.
K = range(1, 10)
dis = []
for k in K:
    km_model = KMeans(n_clusters=k).fit(data)  # fit() returns the fitted estimator
    dis.append(km_model.inertia_)

dis

In [None]:
# Plot the distortions of the combined data against k to locate the elbow.
ElbowMethod(dis=dis, K=K)

The optimal number of clusters is 2, as the elbow method above demonstrates: the 'elbow' point on the graph is at k = 2, meaning n_clusters = 2.

In [None]:
# Number of clusters chosen from the elbow plot.
n = 2
# The return value of KModel is not used here; the clustering shown in the
# figure comes from the separate fit performed inside KMeans_Scatter.
KModel(df=data, K=n)
KMeans_Scatter(df=data, n=n)

### Separating the Male and Female Datasets

#### Male Data

In [None]:
# Fit KMeans for k = 1..9 on the male data and record each model's inertia.
# NOTE(review): the model is fitted on every column left in `male`, whereas
# KMeans_Scatter clusters on 'age' and 'hours-per-week' only - confirm the
# frame holds just those two columns.
K = range(1, 10)
dis = []
for k in K:
    km_model = KMeans(n_clusters=k).fit(male)  # fit() returns the fitted estimator
    dis.append(km_model.inertia_)

dis

In [None]:
# Plot the distortions of the male data against k to locate the elbow.
ElbowMethod(dis=dis, K=K)

The optimal number of clusters is 2, as the elbow method above demonstrates: the 'elbow' point on the graph is at k = 2, meaning n_clusters = 2 (n = 2).

In [None]:
# Number of clusters chosen from the elbow plot.
n = 2
# The return value of KModel is not used here; the clustering shown in the
# figure comes from the separate fit performed inside KMeans_Scatter.
KModel(df=male, K=n)
KMeans_Scatter(df=male, n=n)

#### Female Data

In [None]:
# Fit KMeans for k = 1..9 on the female data and record each model's inertia.
# NOTE(review): the model is fitted on every column left in `female`, whereas
# KMeans_Scatter clusters on 'age' and 'hours-per-week' only - confirm the
# frame holds just those two columns.
K = range(1, 10)
dis = []
for k in K:
    km_model = KMeans(n_clusters=k).fit(female)  # fit() returns the fitted estimator
    dis.append(km_model.inertia_)

dis

In [None]:
# Plot the distortions of the female data against k to locate the elbow.
ElbowMethod(dis=dis, K=K)

The optimal number of clusters is 2, as the elbow method above demonstrates: the 'elbow' point on the graph is at k = 2, meaning n_clusters = 2 (n = 2).

In [None]:
# Number of clusters chosen from the elbow plot.
n = 2
# The return value of KModel is not used here; the clustering shown in the
# figure comes from the separate fit performed inside KMeans_Scatter.
KModel(df=female, K=n)
KMeans_Scatter(df=female, n=n)