In [1]:
# import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.graph_objs as go
import plotly .offline as offline
import plotly.figure_factory as ff
import umap 

**LOADING AND EXAMINING DATASET**

In [2]:
# Load the dataset from the CSV file into a DataFrame.
csv_path = 'sign_mnist.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows (head() returns five rows by default).
first_rows = df.head()
print(first_rows)

   label  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  \
0      3     107     118     127     134     139     143     146     150   
1      6     155     157     156     156     156     157     156     158   
2      2     187     188     188     187     187     186     187     188   
3      2     211     211     212     212     211     210     211     210   
4     13     164     167     170     172     176     179     180     184   

   pixel9  ...  pixel775  pixel776  pixel777  pixel778  pixel779  pixel780  \
0     153  ...       207       207       207       207       206       206   
1     158  ...        69       149       128        87        94       163   
2     187  ...       202       201       200       199       198       199   
3     210  ...       235       234       233       231       230       226   
4     185  ...        92       105       105       108       133       163   

   pixel781  pixel782  pixel783  pixel784  
0       206       204       20

In [4]:
# Preview the last five rows (tail() returns five rows by default).
last_rows = df.tail()
print(last_rows)

      label  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  \
9995     23     152     153     155     157     158     158     158     159   
9996     10      45      46      50      50      58      76      92     105   
9997     24     128     133     135     138     142     148     152     155   
9998      5     164     165     167     169     169     170     170     170   
9999     22     146     150     152     156     161     165     168     171   

      pixel9  ...  pixel775  pixel776  pixel777  pixel778  pixel779  pixel780  \
9995     160  ...       217       217       217       216       216       215   
9996     110  ...        63        60        58        91        98        97   
9997     159  ...       216       216       215       215       215       214   
9998     171  ...       198       192       192       192       190       189   
9999     173  ...       190       180       181       181       181       180   

      pixel781  pixel782  pixel783  pi

In [5]:
# Report the dataset dimensions as a (rows, columns) tuple.
row_count, column_count = df.shape
print((row_count, column_count))

(10000, 785)


In [6]:
# Print a concise summary of the DataFrame (index range, column count,
# dtypes, memory usage).
# DataFrame.info() writes the summary to stdout itself and returns None, so
# it must not be wrapped in print() -- doing so appends a stray "None" line
# after the summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 785 entries, label to pixel784
dtypes: int64(785)
memory usage: 59.9 MB
None


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
summary_stats = df.describe()
print(summary_stats)

              label        pixel1        pixel2        pixel3        pixel4  \
count  10000.000000  10000.000000  10000.000000  10000.000000  10000.000000   
mean      12.310400    145.790700    148.925500    151.665400    153.926200   
std        7.294599     41.912797     40.381637     39.506239     39.041027   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        6.000000    122.000000    126.000000    130.000000    134.000000   
50%       13.000000    151.000000    153.000000    156.000000    158.000000   
75%       19.000000    175.000000    177.000000    178.000000    180.000000   
max       24.000000    255.000000    255.000000    255.000000    255.000000   

             pixel5        pixel6        pixel7        pixel8       pixel9  \
count  10000.000000  10000.000000  10000.000000  10000.000000  10000.00000   
mean     156.552100    158.785700    160.857700    162.736700    164.38930   
std       37.581143     36.483754     35.328474     34

**DIVIDING DATASET INTO TARGET AND FEATURE VARIABLES**

In [8]:
# Target (dependent) variable: the 'label' column of the dataset.
label_column = 'label'
target = df[label_column]

In [9]:
# Feature (independent) variables X: every column except 'label'.
X = df.drop(columns=['label'])

In [10]:
# Sanity-check the split: container types first, then shapes, one per line.
print(type(target), type(X), target.shape, X.shape, sep='\n')

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
(10000,)
(10000, 784)


**NORMALIZE NUMERICAL FEATURES**

In [11]:
# Standardize every feature column: centre on the mean and scale by the
# standard deviation. with_mean / with_std are spelled out for the reader;
# both are the StandardScaler defaults.
feature_scaler = StandardScaler(with_mean=True, with_std=True)
X_scaled = feature_scaler.fit_transform(X)

**IMPLEMENTING DIMENSIONALITY REDUCTION**

In [12]:
# Fit UMAP on the standardized features and keep the resulting embedding
# for visualization.
# random_state is pinned because UMAP is stochastic: without a seed, every
# fresh run of the notebook produces a different embedding and the saved
# plot cannot be reproduced.
# NOTE(review): umap-learn documents that fixing random_state limits its
# multi-threading -- confirm the extra runtime is acceptable for this data.
u = umap.UMAP(n_neighbors=15, min_dist=0.1, random_state=42)
x_umap = u.fit_transform(X_scaled)

In [13]:
# Materialize the 'label' column as a plain Python list.
Sign = [label for label in df['label']]

In [14]:
# Interactive scatter plot of the first two UMAP embedding columns,
# colored by label, with the label shown on hover.
hover_labels = [f'Sign: {sign}' for sign in Sign]
marker_style = {'color': target, 'colorscale': 'Rainbow', 'opacity': 0.5}
umap_trace = go.Scatter(
    x=x_umap[:, 0],
    y=x_umap[:, 1],
    mode='markers',
    marker=marker_style,
    text=hover_labels,
    hoverinfo='text',
)
data = [umap_trace]

layout = go.Layout(
    title='UMAP Signs',
    width=700,
    height=700,
    xaxis={'title': 'First Dimension'},
    yaxis={'title': 'Second Dimension'},
)
fig = go.Figure(data=data, layout=layout)

# Write the figure to an HTML file; the call's return value is the cell output.
offline.plot(fig, filename='Signs.html')


'Signs.html'

In [16]:
# Embed the standardized features in two dimensions with t-SNE.
# random_state is pinned because t-SNE is stochastic: without a seed, every
# fresh run of the notebook produces a different layout and the saved plot
# cannot be reproduced.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version before upgrading.
tsne = TSNE(n_components=2, perplexity=20, n_iter=2000, random_state=42)
x_tsne = tsne.fit_transform(X_scaled)

# Interactive scatter plot of the two t-SNE dimensions, colored by label,
# with the label shown on hover. `text` and `hoverinfo` are arguments of
# go.Scatter (not keys of the marker dict), so they sit at the trace level.
data = [
    go.Scatter(
        x=x_tsne[:, 0],
        y=x_tsne[:, 1],
        mode='markers',
        marker=dict(color=target, colorscale='Rainbow', opacity=0.5),
        text=[f'Sign: {a}' for a in Sign],
        hoverinfo='text',
    )
]

layout = go.Layout(
    title='t-SNE Signs',
    width=700,
    height=700,
    xaxis=dict(title='First Dimension'),
    yaxis=dict(title='Second Dimension'),
)
fig = go.Figure(data=data, layout=layout)

# Write the figure to an HTML file; the call's return value is the cell output.
offline.plot(fig, filename='t-SNE Signs.html')


't-SNE Signs.html'