In [None]:
pip install -U https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [None]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

In [None]:
# Load the raw data set; the file is read as semicolon-delimited.
df = pd.read_csv(
    'winemag-data_first150k.csv',
    sep=';',
)

In [None]:
# Build a profiling report for the raw frame; the bare name on the last
# line makes the notebook display it.
profile = ProfileReport(
    df,
    title="Pandas Profiling Report",
)
profile

In [None]:
# Number of missing values in each column of the raw frame.
print(df.isna().sum())

In [None]:
# Preview the first five rows of the raw frame.
df.head(5)

# **Imputation**

In [None]:
# Mean imputation: work on a copy so the raw frame `df` is left untouched.
new_df = df.copy()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `new_df['price']` selection: the chained in-place form emits a warning on
# pandas 2.x and does not update `new_df` when Copy-on-Write is enabled.
new_df['price'] = new_df['price'].fillna(df['price'].mean())

In [None]:
# Missing values per column in new_df.
print(new_df.isna().sum())

In [None]:
# Fraction of missing values in each column of the raw frame.
df.isna().mean()

In [None]:
# Keep only the columns whose share of missing values is below the threshold.
threshold = 0.5
null_share_by_column = df.isnull().mean()
new_df = df.loc[:, null_share_by_column < threshold]

In [None]:
# Fraction of missing values per column in new_df.
new_df.isna().mean()

In [None]:
# Keep only the rows whose share of missing values is below `threshold`;
# the shape is printed before and after for comparison.
print(df.shape)

null_share_by_row = df.isnull().mean(axis=1)
new_df = df[null_share_by_row < threshold]

print(new_df.shape)

In [None]:
# Missing values per column in new_df.
print(new_df.isna().sum())

In [None]:
# Median imputation: fill missing values with the median of each column.
# numeric_only=True restricts the computation to numeric columns; without it
# DataFrame.median raises TypeError on pandas >= 2.0 when the frame contains
# non-numeric columns. Columns that have no median are left unchanged by fillna.
column_medians = df.median(numeric_only=True)
print(column_medians)
new_df = df.fillna(column_medians)

In [None]:
# Missing values per column in new_df.
print(new_df.isna().sum())

In [None]:
# Constant imputation: replace every missing value in the raw frame with 0.
new_df = df.fillna(value=0)

In [None]:
# Missing values per column in new_df.
print(new_df.isna().sum())

In [None]:
# Preview the first five rows of new_df.
new_df.head(5)

In [None]:
# Drop every row that contains at least one missing value; the shape is
# printed before and after for comparison.
print(df.shape)

new_df = df.dropna(axis=0, how='any')

print(new_df.shape)

# **Handling Outliers**

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt 

In [None]:
# Box plot of the raw price column, saved to boxplot.png.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x=df['price'], color='lime', ax=ax)
ax.set_xlabel('Price Featured', fontsize=14)
fig.savefig('boxplot.png', dpi=300)

In [None]:
# Summary statistics of the raw price column.
df.loc[:, 'price'].describe()

**Drop Outliers with Standard Deviation**

In [None]:
# Keep rows whose price lies strictly within `factor` standard deviations of
# the mean; the shape is printed before and after for comparison.
print(df.shape)

factor = 3
price = df['price']
price_mean = price.mean()
price_spread = price.std() * factor
upper_lim = price_mean + price_spread
lower_lim = price_mean - price_spread

# Rows with a missing price fail both comparisons and are dropped as well.
within_limits = (price < upper_lim) & (price > lower_lim)
drop_outlier1 = df[within_limits]

print(drop_outlier1.shape)

In [None]:
# Box plot of price after the standard-deviation filter.
# Saved to boxplot.png, the same path the raw-price box plot is written to,
# so running this cell replaces that file.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x=drop_outlier1['price'], color='lime', ax=ax)
ax.set_xlabel('Price Featured', fontsize=14)
fig.savefig('boxplot.png', dpi=300)

In [None]:
# Summary statistics of price after the standard-deviation filter.
drop_outlier1.loc[:, 'price'].describe()

In [None]:
# Toy single-column frame holding the integers 1 through 10.
a = list(range(1, 11))
test_df = pd.DataFrame({'A': a})
test_df

In [None]:
# Strict two-sided filter on the toy frame: keeps the rows with 5 < A < 8.
in_range = (test_df['A'] > 5) & (test_df['A'] < 8)
test_df[in_range]

**Drop Outliers with Percentiles**

In [None]:
# Keep rows whose price lies strictly between the 5th and 95th percentiles;
# the shape is printed before and after for comparison.
print(df.shape)

upper_lim = df['price'].quantile(.95)
lower_lim = df['price'].quantile(.05)

# .copy() makes drop_outlier2 an independent frame. Later cells add columns to
# it, and doing that on a plain boolean-filtered selection of `df` triggers
# pandas' SettingWithCopyWarning.
drop_outlier2 = df[(df['price'] < upper_lim) & (df['price'] > lower_lim)].copy()

print(drop_outlier2.shape)

In [None]:
# Box plot of price after the percentile filter.
# Saved to boxplot.png, the same path the earlier box plots are written to,
# so running this cell replaces that file.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x=drop_outlier2['price'], color='lime', ax=ax)
ax.set_xlabel('Price Featured', fontsize=14)
fig.savefig('boxplot.png', dpi=300)

In [None]:
# Summary statistics of price after the percentile filter.
drop_outlier2.loc[:, 'price'].describe()

# **Binning**

In [None]:
# Bin price into three labelled categories. pd.cut uses right-closed intervals
# by default and include_lowest=False leaves the left edge open, so the bins
# are (0, 20] -> low, (20, 40] -> mid, (40, 100] -> high; prices outside
# (0, 100] get NaN.
labels = ['low', 'mid', 'high']
bins = [0., 20., 40., 100.]

# assign() returns a new frame with the extra column. Setting the column
# directly on drop_outlier2 (a filtered selection of `df`) triggers pandas'
# SettingWithCopyWarning.
drop_outlier2 = drop_outlier2.assign(
    price_cat=pd.cut(drop_outlier2['price'], labels=labels, bins=bins, include_lowest=False)
)

In [None]:
# Show five randomly chosen rows. sample(n=5) already returns exactly five
# rows, so the trailing .head() was redundant; the fixed random_state makes
# the preview reproducible across kernel restarts.
drop_outlier2.sample(n=5, random_state=42)

# **Log Transform**

In [None]:
# Histogram of the filtered prices (12 bins, half-transparent), saved to his.png.
ax = drop_outlier2['price'].plot(kind='hist', bins=12, alpha=0.5)
ax.figure.savefig(fname='his.png', dpi=300)

In [None]:
# Add the natural log of price as column 'log'.
# assign() returns a new frame with the extra column. Setting the column
# directly on drop_outlier2 (a filtered selection of `df`) triggers pandas'
# SettingWithCopyWarning. np.log is applied to the Series directly, which
# gives the same values as .transform(np.log).
drop_outlier2 = drop_outlier2.assign(log=np.log(drop_outlier2['price']))

In [None]:
# Histogram of the log-transformed prices (12 bins, half-transparent).
# Saved to his.png, the same path the price histogram is written to,
# so running this cell replaces that file.
ax = drop_outlier2['log'].plot(kind='hist', bins=12, alpha=0.5)
ax.figure.savefig(fname='his.png', dpi=300)

In [None]:
# Show five randomly chosen rows. sample(n=5) already returns exactly five
# rows, so the trailing .head() was redundant; the fixed random_state makes
# the preview reproducible across kernel restarts.
drop_outlier2.sample(n=5, random_state=42)

# **One-hot Encoding**

In [None]:
# One-hot encode price_cat: one indicator column per category.
encoded_columns = pd.get_dummies(drop_outlier2['price_cat'])
# Drop indicator columns left over from an earlier run of this cell before
# joining. Without this, re-running the cell raises ValueError because join
# refuses overlapping column names.
drop_outlier2 = (
    drop_outlier2
    .drop(columns=list(encoded_columns.columns), errors='ignore')
    .join(encoded_columns)
)

In [None]:
# Show five randomly chosen rows. sample(n=5) already returns exactly five
# rows, so the trailing .head() was redundant; the fixed random_state makes
# the preview reproducible across kernel restarts.
drop_outlier2.sample(n=5, random_state=42)