In [34]:
import os
import sys
sys.path.append('../')
from settings import PROJECT_ROOT

import pandas as pd
import qgrid

import numpy as np 

%matplotlib inline
import matplotlib as plt
from matplotlib import pyplot

import seaborn as sns

from scipy import stats


In [3]:
filename = 'file_test.csv'
data_path = os.path.join(PROJECT_ROOT, 'data', filename)
df = pd.read_csv(data_path, index_col=0)
qgrid = qgrid.show_grid(df,show_toolbar=True)

In [9]:
qgrid


QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [13]:
qgrid

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [10]:
changed_df = qgrid.get_changed_df()

DATA FRAME PREPROCESSING

In [15]:
df.describe()

Unnamed: 0,y
count,2.0
mean,0.5
std,0.707107
min,0.0
25%,0.25
50%,0.5
75%,0.75
max,1.0


In [16]:
df.info

<bound method DataFrame.info of        x  y
0  'ale'  1
1  'ehg'  0>

In [None]:
df.replace("?", np.nan, inplace = True)

In [14]:
df.dtypes

x    object
y     int64
dtype: object

In [17]:
missing_data = df.isnull()
missing_data

Unnamed: 0,x,y
0,False,False
1,False,False


In [None]:
df.dfropna(subset=['column_name'], axis=0)


In [None]:
for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")   

REPLACE MISSING VALUE BY MEAN

In [None]:
column_name = None 
avg_norm = df[column_name].astype("float").mean(axis=0)
print("Average of normalized-column_name:", avg_norm)
df[column_name].replace(np.nan, avg_norm, inplace=True)

REPLACE MISSING VALUE BY MOST FREQUENT ELEMENT 


In [24]:
most_freq_el = df[column_name].value_counts().idxmax()
print("most freq el in {}: {}".format(column_name, most_freq_el))
df[column_name].replace(np.nan, most_freq_el, inplace=True)

most freq el in x: 'ale'


NORMALIZATION


In [None]:
# simple feature scalling
df[column_name] = df[column_name]/df[column_name].max()
# MIN-Max scaling
df[column_name] = (df[column_name] - df[column_name].min()) / 
                (df[column_name].max() - df[column_name].min())
# Z-score
df[column_name] = (df[column_name] - df[column_name].mean()) / 
                 df[column_name].std()

BINNING


In [None]:
df[column_name]=df[column_name].astype(int, copy=True)
plt.pyplot.hist(df[column_name])
# set x/y labels and plot title
plt.pyplot.xlabel(column_name)
plt.pyplot.ylabel("count")
plt.pyplot.title("{} bins".format(column_name))

In [30]:
bins = np.linspace(min(df[column_name]), max(df[column_name]), 4)
group_names = ['Low', 'Medium', 'High']
binned_column_name = '{}-binned'.format(column_name) 
df[] = pd.cut(df[column_name], bins, labels=group_names, include_lowest=True)
pyplot.bar(group_names, df[binned_column_name].value_counts())
# set x/y labels and plot title
plt.pyplot.xlabel(column_name)
plt.pyplot.ylabel("count")
plt.pyplot.title("{} bins".format(column_name))


TypeError: ufunc 'multiply' did not contain a loop with signature matching types dtype('<U32') dtype('<U32') dtype('<U32')

In [None]:
GET ONE HOT ENCODING 


In [None]:
dummy_variable_1 = pd.get_dummies(df[column_name])
dummy_variable_1.head()

In [None]:
# merge data frame "df" and "dummy_variable_1" 
df = pd.concat([df, dummy_variable_1], axis=1)
df.drop(column_name, axis = 1, inplace=True)

EXPLORATORY DATA ANALYSIS


In [None]:
df.describe()
# df.describe(include=['object'])

In [None]:
df.corr()

In [35]:
column1_name = 'x'
column2_name = 'y'

In [None]:
sns.regplot(x=column1_name, y=column2_name, data=df)
plt.ylim(0,)

In [None]:
df[[column1_name, column2_name]]

In [None]:
# VALUE COUNTS
# categorical variable name
categorical_column = 'c_n'
engine_loc_counts = df[categorical_column].value_counts().to_frame()
engine_loc_counts.rename(columns={categorical_column: 'value_counts'}, inplace=True)
engine_loc_counts.index.name = categorical_column 
# engine_loc_counts.head(10)

In [None]:
# GROUP BY
# grouping results
cat_col1 = 'name1'
cat_col2 = 'name2'
cont_col = 'name3'
df_gptest = df[[cat_col1, cat_col2,cont_col]]
grouped_test1 = df_gptest.groupby([cat_col1, cat_col2],as_index=False).mean()
grouped_test1

In [None]:
# PIVOT FORM
grouped_pivot = grouped_test1.pivot(index='drive-wheels',columns='body-style')
grouped_pivot

# grouped_pivot = grouped_pivot.fillna(0) #fill missing values with 0
# grouped_pivot

In [None]:
def compute_pearrson_corr(df, dependent_col)
    computed_p_coef_p_value_pairs = dict() 
    for column in df.columns:
        if column is not dependent_col:
            pearson_coef, p_value = stats.pearsonr(df[column], df[dependent_col])
            computed_p_coef_p_value_pairs[column] = (pearson, p_value)

In [None]:
# ANALYSIS OF VARIANCE - ANOVA - for categorical variables
grouped_test2=df_gptest[['drive-wheels', 'price']].groupby(['drive-wheels'])
# ANOVA
f_val, p_val = stats.f_oneway(grouped_test2.get_group('fwd')['price'], grouped_test2.get_group('rwd')['price'], grouped_test2.get_group('4wd')['price'])  
 
print( "ANOVA results: F=", f_val, ", P =", p_val) 


Visualization


In [36]:
column1_name = 'column_1'
column2_name = 'column_2'

In [None]:
sns.boxplot(x=column1_name, y=column2_name, data=df)


In [None]:
fig, ax = plt.subplots()
im = ax.pcolor(grouped_pivot, cmap='RdBu')

#label names
row_labels = grouped_pivot.columns.levels[1]
col_labels = grouped_pivot.index

#move ticks and labels to the center
ax.set_xticks(np.arange(grouped_pivot.shape[1]) + 0.5, minor=False)
ax.set_yticks(np.arange(grouped_pivot.shape[0]) + 0.5, minor=False)

#insert labels
ax.set_xticklabels(row_labels, minor=False)
ax.set_yticklabels(col_labels, minor=False)

#rotate label if too long
plt.xticks(rotation=90)

fig.colorbar(im)
plt.show()