In [2]:
import os
import pathlib as Path  # NOTE(review): this binds the pathlib *module* to the name `Path`; `from pathlib import Path` was probably intended

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Part 1: Prepare the Data

In [3]:
#Import the data
#Read the 'myopia.csv' file into pandas.
#The path is built from its components so the separator matches the host OS
#(a single-argument os.path.join returns its argument unchanged).
test_file = os.path.join("Resources", "myopia.csv")
myo_df = pd.read_csv(test_file)
#Preview the first five rows to confirm the file loaded as expected
myo_df.head()

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY,MYOPIC
0,6,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1,1
1,6,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1,0
2,6,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0,0
3,6,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1,1
4,5,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0,0


In [4]:
#Before removing the "MYOPIC" column, list the distinct values it contains
target_column = myo_df['MYOPIC']
target_column.unique()

array([1, 0], dtype=int64)

In [5]:
#Build a copy of the frame without the "MYOPIC" column; 'myo_df' itself is left unchanged
myo_df_drop = myo_df.drop(columns=['MYOPIC'])
#Preview the result to confirm the column is gone
myo_df_drop.head()

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY
0,6,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1
1,6,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1
2,6,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0
3,6,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1
4,5,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0


In [8]:
#Standardize your dataset so that columns that contain larger values
#do not influence the outcome more than columns with smaller values.

#Before scaling, inspect each column's dtype to spot any non-numeric fields
myo_df_drop.dtypes

AGE            int64
SPHEQ        float64
AL           float64
ACD          float64
LT           float64
VCD          float64
SPORTHR        int64
READHR         int64
COMPHR         int64
STUDYHR        int64
TVHR           int64
DIOPTERHR      int64
MOMMY          int64
DADMY          int64
dtype: object

In [9]:
#Count the missing values in every column (summing the boolean null mask column-wise)
null_mask = myo_df_drop.isnull()
null_mask.sum()

AGE          0
SPHEQ        0
AL           0
ACD          0
LT           0
VCD          0
SPORTHR      0
READHR       0
COMPHR       0
STUDYHR      0
TVHR         0
DIOPTERHR    0
MOMMY        0
DADMY        0
dtype: int64

In [14]:
#Scaling the data
#fit_transform fits the scaler to 'myo_df_drop' and returns the standardized values;
#'scaler' is kept as its own variable so the fitted object stays available afterwards
scaler = StandardScaler()
myo_scaled = scaler.fit_transform(myo_df_drop)

In [15]:
#Show the first standardized sample (row 0, all feature columns)
myo_scaled[0, :]

array([-0.42021911, -1.3639169 , -0.89286146,  0.48378402, -0.28144315,
       -1.0197916 ,  4.1506609 ,  1.69744958, -0.68931054, -0.67299591,
        0.18405806,  0.49830393,  0.98713773,  1.0032415 ])

# Part 2: Apply Dimensionality Reduction

In [13]:
#Perform dimensionality reduction with PCA. How did the number of the features change?
#...For this assignment, preserve 90% of the explained variance in dimensionality reduction.

#Initialize PCA model.
#A fractional n_components asks PCA to keep as many components as are needed
#for the explained variance to exceed that fraction (here 90%).
pca = PCA(n_components=0.9)

In [17]:
#Get principal components for 'myo_scaled' data
#fit_transform fits the PCA model on the scaled data and returns the samples projected onto the retained components
myo_pca = pca.fit_transform(myo_scaled)

In [18]:
#Fetch the explained variance ratio of each retained principal component
#(the recorded output below lists 10 components, down from the 14 scaled features)
pca.explained_variance_ratio_

array([0.21177355, 0.15659716, 0.08688023, 0.08301762, 0.07584858,
       0.06997878, 0.06486986, 0.06377808, 0.05393666, 0.05205566])

In [19]:
#Total share of variance kept by the retained components (expected to be at least the 0.9 requested)
variance_ratios = pca.explained_variance_ratio_
variance_ratios.sum()

0.9187361702915186

In [None]:
#TODO: Further reduce the dataset dimensions with t-SNE and visually inspect the results.
#To do this, run t-SNE on the principal components ('myo_pca'), which are the output of the PCA transformation.



In [None]:
#TODO: Create a scatter plot of the t-SNE output. Are there distinct clusters?