Loading and understanding the data - Preparing to select columns as predictors.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression 
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, learning_curve, validation_curve
import sklearn.model_selection as model_selection
from sklearn.cluster import KMeans
from yellowbrick.cluster import SilhouetteVisualizer 
from yellowbrick.cluster import KElbowVisualizer
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder

In [None]:
codecamp_coders_survey = pd.read_csv("/kaggle/input/2021-new-coder-survey/2021 New Coder Survey.csv")
codecamp_coders_survey.head()

In [None]:
codecamp_coders_survey.columns

In [None]:
codecamp_coders_survey.info()

In [None]:
codecamp_coders_survey.keys()

Selecting my predictors; the target is Income.

In [None]:
# Positional column of the raw survey that feeds each predictor / the target.
# Insertion order fixes the column order of the resulting frame.
survey_column_positions = {
    'Hours_Learning': 7,
    'Months_Programming': 8,
    'Age': 23,
    'Relocate': 19,
    'Race': 25,
    'Area': 26,
    'Education_Level': 32,
    'Income': 22,
}
df_survey_attr = pd.DataFrame({
    column_name: codecamp_coders_survey.iloc[:, position]
    for column_name, position in survey_column_positions.items()
})
df_survey_attr.head()

In [None]:
print("Total Rows: ",len(df_survey_attr.index))

Understanding Income distribution

In [None]:
# Count respondents per income bracket, most frequent bracket first.
# Both the column and the ordering come from df_survey_attr, so `data` now
# names the frame that is actually being plotted.
income_order = df_survey_attr['Income'].value_counts(ascending=False).index
sns.countplot(y='Income', data=df_survey_attr, order=income_order)
plt.xlabel("count")
plt.ylabel("income")
plt.show()

Data Preparation & Cleaning - Transforming target variable into two classes

In [None]:
df_survey_attr["Income"].unique()

replacing NaN with the mode of the data.

Removing missing values

In [None]:
# Show the most common income answer for reference.
print(df_survey_attr["Income"].mode()[0])
# Fill missing incomes with the literal 'None' (presumably the mode printed
# above -- confirm). Assign the result back instead of calling
# fillna(inplace=True) on the selected column: the chained in-place form
# warns in pandas 2.x and silently does nothing under copy-on-write.
df_survey_attr["Income"] = df_survey_attr["Income"].fillna('None')
df_survey_attr.head()

In [None]:
df_survey_attr = df_survey_attr[df_survey_attr["Income"] != 'I don’t know']
print(df_survey_attr["Income"].unique())

In [None]:
df_survey_attr = df_survey_attr[df_survey_attr["Income"] != "I don't want to answer"]
print(df_survey_attr["Income"].unique())

replacing none with under 1000

In [None]:
df_survey_attr["Income"] = df_survey_attr["Income"].replace('None','Under $1,000')
print(df_survey_attr["Income"].unique())

Ordinal-encoding Income

In [None]:
income_level_order = [[
                       'Under $1,000',
                       '$1,000 to $2,999',
                       '$3,000 to $4,999',
                       '$5,000 to $6,999', 
                       '$7,000 to $9,999',
                       '$10,000 to $14,999',
                       '$15,000 to $19,999',
                       '$20,000 to $24,999',
                       '$25,000 to $29,999',
                       '$30,000 to $34,999', 
                       '$35,000 to $39,999',
                       '$40,000 to $49,999',
                       '$50,000 to $59,999',
                       '$60,000 to $74,999',
                       '$75,000 to $89,999',
                       '$90,000 to $119,999',
                       '$120,000 to $159,999',
                       '$160,000 to $199,999', 
                       '$200,000 to $249,999',
                       '$250,000 or over',]] 

encoder = OrdinalEncoder(categories = income_level_order)  
df_survey_attr['IncomeLevel'] = encoder.fit_transform(df_survey_attr[['Income']])
df_survey_attr

Visualize Income in levels

In [None]:
sns.countplot(y='IncomeLevel', data=df_survey_attr)

New HighIncome variable: if Income is over $29,999 (IncomeLevel 9 or above), the respondent is considered high income.

In [None]:
# Levels below 9 map to 0; every other level maps to 1.
is_below_threshold = df_survey_attr['IncomeLevel'] < 9
df_survey_attr['HighIncome'] = (~is_below_threshold).astype('int64')
df_survey_attr.head()

In [None]:
df_survey_attr.isna().sum()

Cleaning and preparing Hours_learning

In [None]:
df_survey_attr["Hours_Learning"].unique()

In [None]:
df_survey_attr["Hours_Learning"].isnull().sum()

In [None]:
df_survey_attr["Hours_Learning"].max()

In [None]:
df_survey_attr['Hours_Learning'].plot(kind='box', title= 'Hours Learning plot')

In [None]:
print(df_survey_attr['Hours_Learning'].quantile([0.25, 0.5, 0.75]))

In [None]:
# Flag weekly learning hours outside the 1.5 * IQR fences.
hours_learning = df_survey_attr['Hours_Learning']
Quart1 = hours_learning.quantile(0.25)
Quart3 = hours_learning.quantile(0.75)
IQR = Quart3 - Quart1
upper_fence = Quart3 + 1.5 * IQR
lower_fence = Quart1 - 1.5 * IQR
is_outlier = (hours_learning > upper_fence) | (hours_learning < lower_fence)
Outliers = df_survey_attr[is_outlier]
print(Outliers['Hours_Learning'])

Average sleep time is 7–9 hours per night; taking the lower bound of 7 hours equates to 49 hours of sleep per week. Hence I take 101 hours as the maximum anyone can realistically work in a week (well within the 168 − 49 = 119 waking hours). As a result, I will replace all values above this with the median.

In [None]:
# Weekly learning hours above this cap are treated as invalid entries.
MAX_WEEKLY_HOURS = 101
median = df_survey_attr["Hours_Learning"].median()
# One vectorised pass instead of a full-column Series.replace per row:
# every value above the cap becomes the median; NaN is left untouched.
df_survey_attr["Hours_Learning"] = df_survey_attr["Hours_Learning"].mask(
    df_survey_attr["Hours_Learning"] > MAX_WEEKLY_HOURS, median
)

In [None]:
# Impute missing weekly learning hours with the column median.
median = df_survey_attr["Hours_Learning"].median()
# Assign back rather than fillna(inplace=True) on the selected column: the
# chained in-place form is a silent no-op under pandas copy-on-write.
df_survey_attr["Hours_Learning"] = df_survey_attr["Hours_Learning"].fillna(median)
df_survey_attr.head()

Preparing and cleaning the Months_Programming column.

In [None]:
df_survey_attr["Months_Programming"].unique()

In [None]:
df_survey_attr["Months_Programming"].isnull()

In [None]:
df_survey_attr["Months_Programming"].isnull().sum()

In [None]:
df_survey_attr.Months_Programming = pd.to_numeric(df_survey_attr.Months_Programming, errors='coerce').fillna(0)
df_survey_attr.head()

In [None]:
df_survey_attr['Months_Programming'].unique()

In [None]:
df_survey_attr["Months_Programming"].plot(kind='box', title= 'Months Programming plot')

Need to replace the outlier, 1.0000e+41

In [None]:
# Values above this many months are treated as invalid entries
# (NOTE(review): 647 months is roughly 54 years -- confirm the intended cap).
MAX_MONTHS_PROGRAMMING = 647
# float() works whether median() yields a NumPy scalar or a plain float.
median = float(df_survey_attr["Months_Programming"].median())
# One vectorised pass instead of a full-column Series.replace per row.
df_survey_attr["Months_Programming"] = df_survey_attr["Months_Programming"].mask(
    df_survey_attr["Months_Programming"] > MAX_MONTHS_PROGRAMMING, median
)

In [None]:
df_survey_attr["Months_Programming"].plot(kind='box', title= 'Months Programming plot')

In [None]:
df_survey_attr["Months_Programming"].max()

Cleaning and preparing age

In [None]:
df_survey_attr["Age"].unique()

In [None]:
df_survey_attr["Age"].isnull().sum()

In [None]:
# Impute missing ages with the column median.
median = df_survey_attr["Age"].median()
# Assign back rather than fillna(inplace=True) on the selected column: the
# chained in-place form is a silent no-op under pandas copy-on-write.
df_survey_attr["Age"] = df_survey_attr["Age"].fillna(median)
df_survey_attr.head()

In [None]:
df_survey_attr["Age"].plot(kind='box', title= 'Age plot')

Replacing implausible ages (the listed values of 80 and above) with the median

In [None]:
wrong_ages = [100,99,95,89,120,88,84,82,83,80]
# Compute the replacement value in this cell instead of reading the shared
# `median` global, which other cells rebind for different columns.
age_median = df_survey_attr["Age"].median()
# A single list-based replace handles every listed age in one pass.
df_survey_attr['Age'] = df_survey_attr['Age'].replace(wrong_ages, age_median)

In [None]:
df_survey_attr["Age"].unique()

In [None]:
df_survey_attr["Age"].plot(kind='box', title= 'Age plot')

The remaining outliers are accepted as plausible ages.

Preparing and cleaning Education_Level

In [None]:
df_survey_attr["Education_Level"]. unique()

In [None]:
# Impute missing education levels with the most frequent answer.
mode = df_survey_attr["Education_Level"].mode()
# Assign back rather than fillna(inplace=True) on the selected column: the
# chained in-place form is a silent no-op under pandas copy-on-write.
df_survey_attr["Education_Level"] = df_survey_attr["Education_Level"].fillna(mode[0])
df_survey_attr["Education_Level"].unique()

In [None]:
incorrect_val = 27.081495040151157
df_survey_attr["Education_Level"] = df_survey_attr["Education_Level"].replace(incorrect_val, mode[0])
df_survey_attr["Education_Level"].unique()

In [None]:
df_survey_attr['Education_Level'] = df_survey_attr['Education_Level'].replace('Some high school',  'No high school (secondary school)')
df_survey_attr['Education_Level'] = df_survey_attr['Education_Level'].replace('Some college credit, no degree', 'High school diploma or equivalent (GED)')

In [None]:
df_survey_attr.head(12)

Cleaning and preparing Area

In [None]:
df_survey_attr['Area'].unique()

In [None]:
# Impute missing areas with the most frequent answer.
mode = df_survey_attr["Area"].mode()
# Assign back rather than fillna(inplace=True) on the selected column: the
# chained in-place form is a silent no-op under pandas copy-on-write.
df_survey_attr["Area"] = df_survey_attr["Area"].fillna(mode[0])
df_survey_attr["Area"].unique()

In [None]:
# The stray numeric entry is spelled out here instead of read from the shared
# `incorrect_val` global, which a later cell rebinds to a string; re-running
# this cell after that point would otherwise silently replace nothing.
area_incorrect_val = 27.081495040151157
df_survey_attr["Area"] = df_survey_attr["Area"].replace(area_incorrect_val, mode[0])
df_survey_attr["Area"].unique()

In [None]:
df_survey_attr.head(12)

Cleaning and preparing Relocate

In [None]:
df_survey_attr['Relocate'].unique()

In [None]:
# Impute missing relocation answers with the most frequent answer.
mode = df_survey_attr['Relocate'].mode()
# Assign back rather than fillna(inplace=True) on the selected column: the
# chained in-place form is a silent no-op under pandas copy-on-write.
df_survey_attr['Relocate'] = df_survey_attr['Relocate'].fillna(mode[0])
df_survey_attr['Relocate'].unique()

In [None]:
incorrect_val = 'I am not interested in a software development career'
df_survey_attr["Relocate"] = df_survey_attr["Relocate"].replace(incorrect_val, mode[0])
df_survey_attr['Relocate'].unique()

Preparing and cleaning Race

In [None]:
df_survey_attr['Race'].unique()

In [None]:
#df["Race"].info()
df_survey_attr["Race"].nunique()

In [None]:
mode = df_survey_attr["Race"].mode()
print(mode)

In [None]:
incorrect_vals = ['mut',
'A person',
'None of them. I do not find ethnicity as a suitiable label for myself.',
"I don't identify with this idea, may the future hurry and get here now. ",
'Is this even necessary?',
'definerace',
"I don't identify with any group.",
'Antifa',
'Not any particular group',
'I have not met different people so I can not form an opinion on that matter',
'equally',
"I'd rather not :-)",
'Fuck off',
'Decline to State',
"I really don't care for this type of grouping ",
"don't matter",
'Perfer not to say',
'Prefer not to answer ',
'thats loaded, i live in a rural community, i identify as a free thinking individual limited by societals perception of me.',
'Huwy',
'Earthling',
'IDK. IDC.',
"You cannot chose your race, therefor race is not an 'identity'. The premise that this question is based on, is overtly delusional. Questions like this promote a worldview of persecution for whites, Jews and Christians.",
'Human (this kind of questions are too racist)',
'let me be just human',
"It doesn't matter...",
'ugh! I hate this question!',
'None',
"That's one hell of a stupid question IMO.",
'any',
'Cyborg',
'узбекистан',
 'None, every human being is an individual, races should not matter!',
'My ARN is Modernazed I have mushrooms on my skin.',
'a man from earth <3',
'Skin color does not matter',
'Prefer not say',
'no respondo',
'i am just a human',
'Person',
'i do not think this is relevant. ',
'Ninja',
'none of those, I identify myself as a "person" like any other person in the world.',
"as a sociologist i'm surprised this is how this question is framed",
'Non-racist',
'None of your business',
'I despise this whole grouping people into races.',
'Wtf?!',
"Don't think of myself in these ethnic labels",
'why does this matter?',
'a human',
'person',
'? why should this be important',
'is this important ?',
'people are equal, that is why i prefer not to say or group myself in any kind of race',
"doesn't matter",
'Homo Sapien',
"Don't really care about it nor fit any of these.",
'Na',
'dont ask personal info',
'I dont really know',
'I found no difference of groups... I like to work with all of them!  ',
'Chestnut-Espresso',
'None, white is far too limiting. I am from Europe we are all white, but very culturally diverse',
'As a human being, a good one.',
'prefer not to say',
"Don't Identify",
'Human Being',
'none ',
'?',
'Ni',
'Earth born',
'WTF ',
'Prefer NOt to say',
'This isnt a good question, i identify myself as flying turtle',
'I don\'t "identify" myself by "race".',
'not applicable',
'strange question. I primarily identify as a human.',
'Alien',27.081495040151157,'With anyone', 'human', 'Human ','Human.',
'Humanbeing','racist question', 'Human','Prefer not to say','people',
'none','Homo Sapiens', 'earthlings \U0001fa90','alien','Idk', 'why is that relevant?',
'Nunya','Pop-Tartian or Snickers Barrian','Human being','A person.','crocodile','Atomic Submarine',
'any nationality', 'Jedi','Jewish/Christian.', 'jewish minorities'
, 'Jew', 'Jewish','Human species']

# Map every listed non-answer to the most frequent Race value in one
# vectorised call instead of one full-column replace per list entry.
df_survey_attr['Race'] = df_survey_attr['Race'].replace(incorrect_vals, mode[0])
    
df_survey_attr["Race"].nunique()

In [None]:
# Impute missing Race answers with the most frequent value. The result is
# assigned back because fillna(inplace=True) on a selected column is a silent
# no-op under pandas copy-on-write, and leftover NaN would break the
# str.contains boolean masks used in the following cells.
df_survey_attr["Race"] = df_survey_attr["Race"].fillna(mode[0])

In [None]:
# Any answer containing one of these fragments (case-insensitive) becomes 'Mixed Race'.
# NOTE(review): these are unanchored substring matches, so 'and' and 'all'
# also hit words such as 'Islander' -- confirm that is intended.
for fragment in ('biracial', 'mix', 'half', 'and', 'two', 'multi', 'all'):
    matches_fragment = df_survey_attr['Race'].str.contains(fragment, case=False)
    df_survey_attr.loc[matches_fragment, 'Race'] = 'Mixed Race'

In [None]:
# Any answer containing one of these fragments (case-insensitive) becomes 'Hispanic/Latino(a/x)'.
for fragment in ('Hispanic', 'Latina', 'Latino', 'South America', 'South American'):
    matches_fragment = df_survey_attr['Race'].str.contains(fragment, case=False)
    df_survey_attr.loc[matches_fragment, 'Race'] = 'Hispanic/Latino(a/x)'

In [None]:
# Any answer containing one of these fragments (case-insensitive) becomes 'Black/African American'.
for fragment in ('black', 'Nigeria', 'Africa', 'Color', 'Caribbean', 'Afrolatin'):
    matches_fragment = df_survey_attr['Race'].str.contains(fragment, case=False)
    df_survey_attr.loc[matches_fragment, 'Race'] = 'Black/African American'

In [None]:
# Any answer containing one of these fragments (case-insensitive) becomes 'White'.
for fragment in ('white', 'Slavic', 'Europe', 'English'):
    matches_fragment = df_survey_attr['Race'].str.contains(fragment, case=False)
    df_survey_attr.loc[matches_fragment, 'Race'] = 'White'
In [None]:
# Any answer containing one of these fragments (case-insensitive) becomes 'Asian'.
for fragment in ('Asia', 'Asian', 'Indian', 'India', 'Japanese', 'Korean'):
    matches_fragment = df_survey_attr['Race'].str.contains(fragment, case=False)
    df_survey_attr.loc[matches_fragment, 'Race'] = 'Asian'

In [None]:
others =[ 'Turk', 'NZ Maori', 'Mediterranean',
       'brown', 'Greek', 'Native American- Dine', 'Mestizo',
       'Кавказ', 
       'Anatolian (Turk)', 'Armenian', 'brasilian',
       'Polynesian', 'Turkish',
       ' Parda ',
       'Although I am dark or brown by skin colour, I wish this question was asked based on countries',
       'Moorish American', 
       "greek I don't know which one of these a greek is",
       'Basque', 'Silesian', 'Coloured', 'arab', 
       'mammalian', 'brazilian', 'parda',
       'Swede', 
       'I am from Nepal.', 'coloured ', 'dutch', 'Brazilian', 'Slav',
       'Gypsish',
       'The Norsemen',
       'turkish', 'Klingon', 'Argentino <3',
       'Israeli', 'pardo', 'belarussian',
       'coloured', 'Iberian',
       'North Eastern', 'Melanesia', 'Uto-aztecan', 'Surinamese',
       'Kurdish / Kurdistani', 'Hawaiian',
       'Anatolian', 'Caucasus',
       'Middle Eastern', 'Kurdish',
       'Hawaiian/Okinawan', 'Melungeon',
       'Maori', 'Melanisian', 'Turkic', 'suramericana', 'Panama Native',
       'Native American', 'Irish', 'international', 'Circassian ',
       'latin american', 'Gypsy', 'Parda', 'Samoan ',
       'greek',
       'Native Hawaiian', 'Sri Lankan Aussie', 'Moroccan ',
       'indigenous ', 'Brown','Pardo','Earth', 'excuse me?']

# Exact-match recode of every listed answer to 'Other' in one vectorised call.
df_survey_attr['Race'] = df_survey_attr['Race'].replace(others, 'Other')

In [None]:
Extra_BlackAfrican=['Ethiopian','Kenyan','Affrican','ethiopian','Tunisian', 'kenyan','Bajan', 'Cameroonian','Egyptian', 'Jamaican', 'senegal','Egyptian ']

# Exact-match recode of every listed answer in one vectorised call.
df_survey_attr['Race'] = df_survey_attr['Race'].replace(Extra_BlackAfrican, 'Black/African American')

In [None]:
Extra_White = ['Eurpean', 'Russia','Irish/German','Celtic, Anglo Saxon with Druidic roots.', 'Russian T-14', 'German', 'Canadian', 'american','Russian','American ','American', 'Australian ']

# Exact-match recode of every listed answer in one vectorised call.
df_survey_attr['Race'] = df_survey_attr['Race'].replace(Extra_White, 'White')

In [None]:
Extra_Asian =['Chinese','Pakistani','Bangladeshi', 'Pakistan','Sri Lankan','Hongkonger','China',
'indonesian','chinese', 'Filipino','pakistan','Bangladesh']

# Exact-match recode of every listed answer in one vectorised call.
df_survey_attr['Race'] = df_survey_attr['Race'].replace(Extra_Asian, 'Asian')

In [None]:
Extra_Mixed_Race = ['bi racial','2 or more races','filipino hawaiian', 'mulitiracial']
# Exact-match recode of every listed answer in one vectorised call.
df_survey_attr['Race'] = df_survey_attr['Race'].replace(Extra_Mixed_Race, 'Mixed Race')

In [None]:
Ex_HispanicLatino = ['Im from Argentina','Latin','Mexican American','Puerto Rican', 'mexican']
# Exact-match recode of every listed answer in one vectorised call.
df_survey_attr['Race'] = df_survey_attr['Race'].replace(Ex_HispanicLatino, 'Hispanic/Latino(a/x)')

In [None]:
df_survey_attr["Race"].nunique()

In [None]:
df_survey_attr["Race"].unique()

In [None]:
len(df_survey_attr.index)

In [None]:
df_survey_attr.head()

All columns have now been prepared and cleaned

EDA

In [None]:
sns.countplot(y='HighIncome', data=df_survey_attr)

In [None]:
df_survey_attr.Relocate.value_counts().plot(kind='pie',autopct='%1.1f%%')
plt.title('Are You willing to relocate for a job')

In [None]:
df_survey_attr.Race.value_counts().plot(kind='pie',autopct='%1.1f%%', fontsize=8)
plt.title('With which of these groups do you primarily identify?')

In [None]:
df_survey_attr.Area.value_counts().plot(kind='pie',autopct='%1.1f%%')
plt.title('Which part of the world do you live in?')

In [None]:
df_survey_attr.Education_Level.value_counts().plot(kind='pie',autopct='%1.1f%%', fontsize=8)
plt.title('What is the highest level of school you completed?')

In [None]:
df_survey_attr["Months_Programming"].plot(kind='box', title= 'Months Programming plot')

In [None]:
df_survey_attr["Hours_Learning"].plot(kind='box', title= 'Hours Learning plot')

In [None]:
df_survey_attr.Race.value_counts().plot(kind='bar')

In [None]:
Highestincome_race = df_survey_attr.groupby('Race')['HighIncome'].sum()
Highestincome_race.plot(kind='bar')
plt.xlabel('Race')
plt.ylabel('HighIncome')
plt.title('HighIncome Race')
plt.show()

In [None]:
Highestincome_edlvl = df_survey_attr.groupby('Education_Level')['HighIncome'].sum()
Highestincome_edlvl.plot(kind='bar')
plt.xlabel('Education_Level')
plt.ylabel('HighIncome')
plt.title('HighIncome Education Level')
plt.show()

In [None]:
# Number of high-income respondents per area (HighIncome is 0/1, so the sum is a count).
Highestincome_area = df_survey_attr.groupby('Area')['HighIncome'].sum()
Highestincome_area.plot(kind='bar')
plt.xlabel('Area')
plt.ylabel('HighIncome')
# Title corrected: this chart is grouped by Area, not by education level.
plt.title('HighIncome Area')
plt.show()

In [None]:
sns.scatterplot(data=df_survey_attr, x="Age", y="Hours_Learning", hue="HighIncome")

In [None]:
sns.scatterplot(data=df_survey_attr, x="Age", y="Months_Programming", hue="HighIncome")

In [None]:
pd.plotting.scatter_matrix(df_survey_attr.iloc[: ,0:3])

In [None]:
sns.stripplot(data=df_survey_attr, x="Relocate", y="Age", hue="HighIncome")

In [None]:
fig, ax = plt.subplots(figsize=(15, 8))

# Summed weekly learning hours per age, one stacked segment per education level.
# The aggregation is named by string; passing the builtin `sum` is deprecated
# in recent pandas.
df_stack = df_survey_attr.pivot_table(index="Age",
               columns="Education_Level",
               values="Hours_Learning",
               aggfunc="sum")

df_stack.plot.bar(stacked=True, ax=ax)

plt.xlabel("Age", fontsize=15)
# Label corrected: the plotted values are Hours_Learning, not Months_Programming.
plt.ylabel("Hours_Learning", fontsize=15)

In [None]:
df_survey_attr.isnull().sum()

data clustering

In [None]:
# Take an explicit copy: a later cell adds a ClusterLabel column to this frame,
# and writing to a bare column selection raises SettingWithCopyWarning.
df_cluster1 = df_survey_attr[['Area','Race','IncomeLevel']].copy()
df_cluster1.head()

In [None]:
X_cluster = pd.get_dummies(df_cluster1) #OneHot Encoding. 
X_cluster.head()

In [None]:
# Fixed seed so the cluster labels are reproducible across runs
# (same seed as the silhouette cell for this feature set).
km = KMeans(n_clusters=3, random_state=1)

In [None]:
y_cluster = km.fit_predict(X_cluster)
print(y_cluster[0:9])
df_cluster1['ClusterLabel'] = y_cluster 
df_cluster1.head()

In [None]:
sns.countplot(data=df_cluster1, x="ClusterLabel", hue="Area")
plt.legend(bbox_to_anchor=(1.0, 1.0))

In [None]:
sns.countplot(data=df_cluster1, x="ClusterLabel", hue="Race")
plt.legend(bbox_to_anchor=(1.0, 1.0))

In [None]:
sns.countplot(data=df_cluster1, x="ClusterLabel", hue="IncomeLevel")
plt.legend(bbox_to_anchor=(1.0, 1.0))

In [None]:
# Init KMeans clustering model and visualiser
model = KMeans()
visualizer = KElbowVisualizer(model, k=(2,10))
visualizer.fit(X_cluster)        # Fit the data to the visualiser
visualizer.show()

In [None]:
# 3 clusters
k=3
# Init KMeans clustering model and visualiser
km =  KMeans(k, random_state=1)
visualizer = SilhouetteVisualizer(km, colors='yellowbrick')
visualizer.fit(X_cluster)        # Fit the data to the visualiser
visualizer.show()        # Finalise and render the figure
print(visualizer.silhouette_score_)

In [None]:
# Take an explicit copy: a later cell adds a ClusterLabel column to this frame,
# and writing to a bare column selection raises SettingWithCopyWarning.
df_cluster2 = df_survey_attr[['Relocate','IncomeLevel']].copy()
df_cluster2.head()

In [None]:
X_cluster1 = pd.get_dummies(df_cluster2) #OneHot Encoding. 
X_cluster1.head()

In [None]:
# Fixed seed so the cluster labels are reproducible across runs
# (same seed as the silhouette cell for this feature set).
km = KMeans(n_clusters=4, random_state=1)

In [None]:
y_cluster = km.fit_predict(X_cluster1)
print(y_cluster[0:13])
df_cluster2['ClusterLabel'] = y_cluster 
df_cluster2.head()

In [None]:
sns.countplot(data=df_cluster2, x="ClusterLabel", hue="Relocate")
plt.legend(bbox_to_anchor=(1.0, 1.0))

In [None]:
sns.countplot(data=df_cluster2, x="ClusterLabel", hue="IncomeLevel")
plt.legend(bbox_to_anchor=(1.0, 1.0))

In [None]:
# Init KMeans clustering model and visualiser
model = KMeans()
visualizer = KElbowVisualizer(model, k=(2,10))
visualizer.fit(X_cluster1)        # Fit the data to the visualiser
visualizer.show()

In [None]:
# 4 clusters
k=4
# Init KMeans clustering model and visualiser
km =  KMeans(k, random_state=1)
visualizer = SilhouetteVisualizer(km, colors='yellowbrick')
visualizer.fit(X_cluster1)        # Fit the data to the visualiser
visualizer.show()        # Finalise and render the figure
print(visualizer.silhouette_score_)

In [None]:
# Take an explicit copy: a later cell adds a ClusterLabel column to this frame,
# and writing to a bare column selection raises SettingWithCopyWarning.
df_cluster3 = df_survey_attr[['Education_Level','IncomeLevel']].copy()
df_cluster3.head()

In [None]:
X_cluster2 = pd.get_dummies(df_cluster3) #OneHot Encoding. 
X_cluster2.head()

In [None]:
# Fixed seed so the cluster labels are reproducible across runs
# (same seed as the silhouette cell for this feature set).
km = KMeans(n_clusters=4, random_state=1)

In [None]:
# fitting data to sluster and creating cluster df
y_cluster = km.fit_predict(X_cluster2)
print(y_cluster[0:13])
df_cluster3['ClusterLabel'] = y_cluster 
df_cluster3.head()

In [None]:
sns.countplot(data=df_cluster3, x="ClusterLabel", hue="Education_Level")
plt.legend(bbox_to_anchor=(1.0, 1.0))

In [None]:
sns.countplot(data=df_cluster3, x="ClusterLabel", hue="IncomeLevel")
plt.legend(bbox_to_anchor=(1.0, 1.0))

In [None]:
model = KMeans()
visualizer = KElbowVisualizer(model, k=(2,10))
visualizer.fit(X_cluster2)        # Fit the data to the visualizer
visualizer.show()

In [None]:
#4 clusters
k=4
# Init KMeans clustering model and visualiser
km =  KMeans(k, random_state=1)
visualizer = SilhouetteVisualizer(km, colors='yellowbrick')
visualizer.fit(X_cluster2)        # Fit the data to the visualiser
visualizer.show()        # Finalise and render the figure
print(visualizer.silhouette_score_)

Prediction - Classification

In [None]:
df_1 = df_survey_attr[['Hours_Learning','Months_Programming','Age','Relocate','Area','Race','Education_Level','IncomeLevel','HighIncome']]
encoded_df = pd.get_dummies(df_1)
encoded_df.head()

In [None]:
encoded_df = encoded_df[[i for i in encoded_df if i not in ['HighIncome']] + ['HighIncome']]
encoded_df.head()

In [None]:
# HighIncome is computed directly from IncomeLevel (0 below level 9, 1 otherwise),
# so IncomeLevel must not be a predictor: it would leak the target and make
# every reported accuracy meaningless.
# Columns are selected by name so the split does not depend on how many dummy
# columns get_dummies happened to produce.
X = encoded_df.drop(columns=['HighIncome', 'IncomeLevel'])
y = encoded_df['HighIncome']
print(y)

In [None]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.3,random_state=4)

new_X_train, X_val, new_y_train, y_val = model_selection.train_test_split(X_train, y_train, test_size=0.2, random_state=4)

In [None]:
#Normalisation
# Hold-out subsets: KNN is distance-based, so the data used to tune k must be
# scaled like the data the final model is trained on. The scaler is fitted on
# the hold-out training part only, so the validation rows do not influence it.
holdout_scaled = MinMaxScaler()
holdout_scaled.fit(new_X_train)
new_X_train = holdout_scaled.transform(new_X_train)
X_val = holdout_scaled.transform(X_val)

# Full train/test split: fit on the training data only, then apply to both.
knn_scaled = MinMaxScaler()
knn_scaled.fit(X_train)
X_train = knn_scaled.transform(X_train)
X_test = knn_scaled.transform(X_test)

In [None]:
#KNN mETHOD
KNN_modl = KNeighborsClassifier(n_neighbors=1,weights="uniform", metric="euclidean")
KNN_modl.fit(X_train, y_train)

In [None]:
#prediction and accuracy
y_predict = KNN_modl.predict(X_test) 
model_accuracy_knn = accuracy_score(y_test, y_predict)
print('KNN model accuracy ',model_accuracy_knn)

In [None]:
plot_confusion_matrix(KNN_modl, X_test, y_test,cmap=plt.cm.Blues)

KNN - manually hypertuning with holdout method

In [None]:
val_results = []
train_results = []

In [None]:
krange = range(1, 31) 
print("k range", krange)

In [None]:
for k in krange:
    clf_knn1 = KNeighborsClassifier(n_neighbors=k)
    clf_knn1 = clf_knn1.fit(new_X_train, new_y_train)
    # finding the accuracy on training data
    new_pred_train = clf_knn1.predict(new_X_train)
    train_score = metrics.accuracy_score(new_y_train, new_pred_train) 
    train_results.append(train_score)
    #finding the accuracy on validation data
    predict_val = clf_knn1.predict(X_val) 
    val_score = metrics.accuracy_score(y_val, predict_val) 
    val_results.append(val_score)

In [None]:
plt.plot(krange, val_results, 'b-', label='validation')
plt.plot(krange, train_results, 'r-', label='training') 
plt.ylabel('Score')
plt.xlabel('k')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
_knn_best1 = KNeighborsClassifier(n_neighbors=13)
_knn_best1 = _knn_best1.fit(X_train, y_train)
y_pred = _knn_best1.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

In [None]:
print("best model:", _knn_best1.get_params())

In [None]:
plot_confusion_matrix(_knn_best1, X_test, y_test,cmap=plt.cm.Blues)

Hyperparameter tuning with the grid search method

In [None]:
clf_knn3 = KNeighborsClassifier() 
#Parameter grid
param_knn_grid = [{'weights':['uniform'], 'n_neighbors':list(range(1,30))},
               {'weights':['distance'], 'n_neighbors':list(range(1,30))}]
print(param_knn_grid)

In [None]:
X_train,X_test,y_train,y_test = model_selection.train_test_split(X,y,test_size=0.3,random_state=4)
# The fresh split returns unscaled data. KNN is distance-based, so re-apply
# min-max scaling (fitted on the training part only) before the grid search
# and the bagged KNN that follow.
knn_gs_scaled = MinMaxScaler()
X_train = knn_gs_scaled.fit_transform(X_train)
X_test = knn_gs_scaled.transform(X_test)

In [None]:
# create a grid search object 
gridsearch = GridSearchCV(clf_knn3, param_knn_grid, scoring='accuracy', cv=10)                          
# now fitting the model using Grid Search for KNN
gridsearch = gridsearch.fit(X_train,y_train)

In [None]:
knn_best_gs = gridsearch.best_estimator_
print("best model:", knn_best_gs.get_params())

# Fitting the best model to the training daTa for knn modl. 
knn_best_gs.fit(X_train, y_train)

In [None]:
y_pred = knn_best_gs.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

In [None]:
plot_confusion_matrix(knn_best_gs, X_test, y_test,cmap=plt.cm.Blues)

bagging the KNN model

In [None]:
# Creating KNN and bagging it
clf_knn = KNeighborsClassifier()
clf_knn_bag = BaggingClassifier(base_estimator=clf_knn,
                            n_estimators=11,
                            random_state=1)

In [None]:
clf_knn_bag = clf_knn_bag.fit(X_train, y_train)
y_train_predict = clf_knn_bag.predict(X_train)
y_test_predict = clf_knn_bag.predict(X_test)
bag_training = accuracy_score(y_train, y_train_predict)
bag_testing = accuracy_score(y_test, y_test_predict)
print('Bagging train/test accuracies %.3f/%.3f'% (bag_training, bag_testing))

In [None]:
plot_confusion_matrix(clf_knn_bag, X_test, y_test,cmap=plt.cm.Blues)

Decision Tree

In [None]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.3,random_state=4)

new_X_train, X_val, new_y_train, y_val = model_selection.train_test_split(X_train, y_train, test_size=0.2, random_state=4)

In [None]:
clf_dtree = DecisionTreeClassifier(criterion='gini',max_depth=3)
clf_dtree = clf_dtree.fit(X_train,y_train)

In [None]:
y_pred = clf_dtree.predict(X_test) 
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
plot_confusion_matrix(clf_dtree, X_test, y_test,cmap=plt.cm.Blues)

Hold out method

In [None]:
max_depth_range = range(1, 16)

In [None]:
#create two lists which record results
val_result = []
train_result = []

In [None]:
# Fit one tree per candidate depth on the hold-out training part and record
# its accuracy on that part and on the validation part.
for depth in max_depth_range:
    clf_dt = DecisionTreeClassifier(criterion='gini',max_depth=depth)
    clf_dt = clf_dt.fit(new_X_train, new_y_train)
    #getting the accuracy on training data
    new_pred_train = clf_dt.predict(new_X_train)
    train_score = metrics.accuracy_score(new_y_train, new_pred_train)
    train_result.append(train_score)
    #getting the accuracy on validation data
    predict_val = clf_dt.predict(X_val)
    val_score = metrics.accuracy_score(y_val, predict_val)
    val_result.append(val_score)

# Print this section's validation scores; `val_results` (plural) is the list
# from the KNN hold-out section.
print(val_result)

In [None]:
plt.plot(max_depth_range, val_result, 'g-', label='Validation')
plt.plot(max_depth_range, train_result, 'r-', label='Training')
plt.ylabel('Scores')
plt.xlabel('Model Complexities - Tree depth')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
_dt_bst1 = DecisionTreeClassifier(max_depth=2)
_dt_bst1 = _dt_bst1.fit(X_train, y_train)
y_pred = _dt_bst1.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

In [None]:
plot_confusion_matrix(_dt_bst1, X_test, y_test,cmap=plt.cm.Blues)

In [None]:
#plotting the Decision-Tree with hodlout meth
fig = plt.figure(figsize=(12, 10)) 
_ = tree.plot_tree(_dt_bst1, feature_names=X.columns, class_names=["Low", "High"],  filled=True)

grid search

In [None]:
# creating the Decision Tree Classifer
clf_dt_3 = DecisionTreeClassifier() 
parameter_grid = [{'criterion':['gini'], 'max_depth':list(range(1,16))},
               {'criterion':['entropy'], 'max_depth':list(range(1,16))}]
# create a grid search object using parameters
grds = GridSearchCV(clf_dt_3, parameter_grid, scoring='accuracy', cv=10)                          
# fit model using grid search
grds = grds.fit(X_train,y_train)
 
#setting the best combo of all parameters
clf_dt_best = grds.best_estimator_
print("best model:",clf_dt_best.get_params())
# Fit the best model to the data. 
clf_dt_best = clf_dt_best.fit(X_train, y_train)

y_pred = clf_dt_best.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

In [None]:
plot_confusion_matrix(clf_dt_best, X_test, y_test,cmap=plt.cm.Blues)

In [None]:
#plotting decision tree with grid search
fig = plt.figure(figsize=(12, 10)) #plot the decision tree
_ = tree.plot_tree(clf_dt_best, feature_names=X.columns, class_names=["Low", "High"],  filled=True)

Logistic regression model (lr)

In [None]:
#normalisation
lr_scaled = MinMaxScaler() 
lr_scaled.fit(X_train)
X_train = lr_scaled.transform(X_train) 
X_test = lr_scaled.transform(X_test)

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
results = logreg.score(X_test, y_test)
print("Accuracy: %.2f%%" % (results*100.0))

In [None]:
plot_confusion_matrix(logreg, X_test, y_test,cmap=plt.cm.Blues)

Logistic regression with grid search

In [None]:
#parameter grid and lr model
params = {'penalty': ["l2"]}
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
grid = GridSearchCV(lr, params, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

In [None]:
plot_confusion_matrix(grid, X_test, y_test,cmap=plt.cm.Blues)

Ensemble Methods

In [None]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.3,random_state=4)

In [None]:
#normalisation bc of KNN & LR
ens_scaled = MinMaxScaler() 
ens_scaled.fit(X_train)
X_train = ens_scaled.transform(X_train) 
X_test = ens_scaled.transform(X_test)

In [None]:
dtmodel = DecisionTreeClassifier()
knnmodel = KNeighborsClassifier()
lrmodel= LogisticRegression(max_iter=10000)
ensemble_lrn = VotingClassifier(estimators=[('dt', dtmodel), ('knn', knnmodel), ('lr', lrmodel)], voting='hard')

In [None]:
#decision tree
dtmodel = dtmodel.fit(X_train, y_train)
y_train_predicts = dtmodel.predict(X_train)
y_test_predicts = dtmodel.predict(X_test) 
training_scores = accuracy_score(y_train, y_train_predicts)
testing_scores = accuracy_score(y_test, y_test_predicts)
print('train/test accuracies %.3f/%.3f' % (training_scores, testing_scores))

In [None]:
#knn
knnmodel = knnmodel.fit(X_train, y_train)
y_train_predicts = knnmodel.predict(X_train)
y_test_predicts = knnmodel.predict(X_test) 
training_scores = accuracy_score(y_train, y_train_predicts)
testing_scores = accuracy_score(y_test, y_test_predicts)
print('train/test accuracies %.3f/%.3f' % (training_scores, testing_scores))

In [None]:
#lr
lrmodel = lrmodel.fit(X_train, y_train)
y_train_predicts = lrmodel.predict(X_train)
y_test_predicts = lrmodel.predict(X_test) 
training_scores = accuracy_score(y_train, y_train_predicts)
testing_scores = accuracy_score(y_test, y_test_predicts)
print('train/test accuracies %.3f/%.3f' % (training_scores, testing_scores))

In [None]:
# ensmeble model
ensemble_lrn = ensemble_lrn.fit(X_train, y_train)

In [None]:
y_train_predicts = ensemble_lrn.predict(X_train)
y_test_predicts = ensemble_lrn.predict(X_test) 
training_scores = accuracy_score(y_train, y_train_predicts)
testing_scores = accuracy_score(y_test, y_test_predicts)
print('train/test accuracies %.3f/%.3f' % (training_scores, testing_scores))