### Here we perform the data cleaning 

In [None]:
import numpy as np,pandas as pd,matplotlib.pyplot as plt,seaborn as sns,warnings

from sklearn.model_selection import train_test_split
# Suppress every warning category for the rest of the session; note that this
# also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Read the ads dataset from disk and preview eight randomly chosen rows.
df = pd.read_csv("Social_Network_Ads_Altered.csv")
df.sample(n=8)

## Data Cleaning --> EDA --> Feature Engineering   

In [None]:
# Count the missing (NaN) entries in each column of the DataFrame.
df.isna().sum()

In [None]:
# Share of missing values per column, expressed as a percentage of all rows.
# Quick sanity math: with 400 rows, 10% is 40 rows and 5% is 20 rows.
df.isnull().sum().div(len(df)).mul(100)

In [None]:
# (rows, columns) of the frame before any rows are dropped.
df.shape 

In [None]:
# Method 1 - Complete Case Analysis (CCA): drop every row that contains a NaN value (suitable when you have plenty of data)

# df = df.dropna()

In [None]:
# Method 2 (limited data): drop a row only when one of these key columns is
# missing; gaps in other columns (e.g. Age) are imputed further down instead.
# `subset` is passed as an ordered list rather than a set, and the result is
# re-assigned instead of mutating the frame in place.
df = df.dropna(subset=['Gender', 'Purchased', 'EstimatedSalary'])

In [None]:
# Re-count the missing values per column after the row drop.
df.isna().sum()

In [None]:
# Impute (fill) the NaN values with the mean, median, an arbitrary value, an end-of-distribution value, etc.
# Ways to impute the NaN values: 1) pandas 2) scikit-learn

In [None]:
# Mean of the Age column, rounded to zero decimals; used below as the fill value.
mean = np.around(df['Age'].mean(), decimals=0)
mean

In [None]:
# Replace the missing Age entries with the rounded mean computed above.
df['Age'] = df['Age'].fillna(value=mean)

In [None]:
# Column dtypes and non-null counts after the drop and the Age imputation.
df.info()

### Outlier Detection 

In [None]:
# Features are the first four columns; the target is the 'Purchased' column.
X = df.iloc[:, 0:4]
y = df['Purchased']

In [None]:
# Preview the first rows of the feature frame.
X.head()

In [None]:
# Preview the first rows of the target series.
y.head()

In [None]:
# Keep only the float64 columns of X.
# NOTE(review): despite the name `int_columns`, this selects floats, not integers.
int_columns = X.select_dtypes(include='float64')
int_columns

In [None]:
# Side-by-side box plots to eyeball outliers in the two numeric features.
# Each series is passed explicitly as `x=`: a bare positional argument is read
# as `x` by older seaborn releases but as `data` by newer ones, which changes
# the orientation of the box. Naming the argument pins a horizontal box.
fig, (ax_age, ax_salary) = plt.subplots(ncols=2, figsize=(12, 8))
sns.boxplot(x=X['Age'], ax=ax_age)
sns.boxplot(x=X['EstimatedSalary'], ax=ax_salary)
plt.show()

In [None]:
# Histogram + KDE for each numeric feature, to inspect the distributions.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# equivalent that draws a density-normalised histogram with a KDE overlay.
fig, (ax_age, ax_salary) = plt.subplots(ncols=2, figsize=(12, 8))
sns.histplot(X['Age'], kde=True, stat='density', ax=ax_age)
sns.histplot(X['EstimatedSalary'], kde=True, stat='density', ax=ax_salary)
plt.show()

## Feature Engineering 
* Feature Construction 

* Feature Encoding 

* Feature Scaling 

* Feature Extraction 

##### Feature Construction --> This technique is preferred when you have some domain knowledge about the data you're working with 

In [None]:
# Feature construction example: load the Titanic data and preview the first rows.
titanic_df = pd.read_csv("Titanic-Dataset.csv")
titanic_df.head(5)


In [None]:
# Shape before the constructed 'Family' column is added.
titanic_df.shape

In [None]:
# Construct a 'Family' feature as the sum of the SibSp and Parch columns,
# then show eight random rows to check the new column.
titanic_df['Family'] = titanic_df['SibSp'] + titanic_df['Parch']
titanic_df.sample(n=8)

In [None]:
# Shape after adding 'Family' (one more column than before).
titanic_df.shape

In [None]:
# SibSp and Parch are now folded into 'Family', so remove them from the frame.
titanic_df.drop(['SibSp', 'Parch'], axis=1, inplace=True)
titanic_df.head(5)


In [None]:
# Shape after dropping SibSp and Parch.
titanic_df.shape

### Feature Encoding 
* Ordinal Encoding 
* Nominal Encoding / One Hot Encoding  --> Scikit Learn (or) Pandas 
* Label Encoding 

In [None]:
# Load the customer dataset used for the encoding examples and preview it.
Customer_df = pd.read_csv('customer.csv')
Customer_df.head(5)

In [None]:
# Features are the first four columns; the target is the 'purchased' column.
X = Customer_df.iloc[:, 0:4]
Y = Customer_df['purchased']

In [None]:
# Print the feature preview and the target preview, separated by a row of 80 asterisks.
print(X.head(), "*" * 80, Y.head(), sep="\n")

In [None]:
# List the distinct values of each categorical feature to decide how to encode it.
for categorical_column in ('review', 'education', 'gender'):
    print(X[categorical_column].unique())

In [None]:
# Distinct values of the target column.
Customer_df['purchased'].unique() 

In [None]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Encode the categorical columns; every column not listed is passed through unchanged.
#  - gender:    one-hot encoded, dropping the first category
#  - review:    ordinal, ordered Poor < Average < Good
#  - education: ordinal, ordered School < UG < PG
# OneHotEncoder's `sparse=` argument was renamed to `sparse_output=` in
# scikit-learn 1.2 and later removed, so neither spelling works on every
# version. Dense output is requested from the ColumnTransformer instead:
# `sparse_threshold=0` makes it always return a dense array.
trf = ColumnTransformer(transformers=[
    ('Ohe-Encoding', OneHotEncoder(drop='first'), ['gender']),
    ('ordinal-Encoding-review', OrdinalEncoder(categories=[['Poor', 'Average', 'Good']]), ['review']),
    ('ordinal-Encoding-education', OrdinalEncoder(categories=[['School', 'UG', 'PG']]), ['education']),
], remainder='passthrough', sparse_threshold=0)

In [None]:
# Fit the column transformer on X and apply it in one step; the result is
# displayed to inspect the encoded values.
X_Encoded = trf.fit_transform(X)
X_Encoded

In [None]:
# Wrap the encoded array in a DataFrame. The names follow the transformer's
# output order: the three encoded columns first, then the passed-through column.
encoded_column_names = ['Gender', 'Review', 'Education', 'Age']
X_df = pd.DataFrame(X_Encoded, columns=encoded_column_names)
X_df.head()

In [None]:
# Label encoding -- mostly preferred for encoding the target column.
# Fit the encoder on Y, then map Y to integer labels and display them.
label_encoder = LabelEncoder().fit(Y)
Y_encoded = label_encoder.transform(Y)
Y_encoded

## Feature Scaling 
* Standardization / __Standard Scaler__


In [None]:
# Load the ads dataset for the scaling examples and preview it.
data = pd.read_csv('Social_Network_Ads.csv')
data.head(5)

In [None]:
# Remove the 'User ID' and 'Gender' columns before scaling.
data = data.drop(['User ID', 'Gender'], axis=1)
data.head(5)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['Purchased']),
    data['Purchased'],
    test_size=0.2,
    random_state=24,
)


In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Learn the scaling parameters from the training set only, then apply the
# same fitted scaler to both the training and the test set.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# The scaler returns plain arrays; wrap them in DataFrames with the original column names.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [None]:
# Preview the standardized training features.
X_train_scaled_df.head()

In [None]:
# Summary statistics of the unscaled training features.
X_train.describe()

In [None]:
# Summary statistics of the scaled training features, for comparison with the unscaled ones.
X_train_scaled_df.describe()

In [None]:
# Scatter of Age against EstimatedSalary: raw values on the left, standardized on the right.
fig, axes = plt.subplots(ncols=2, figsize=(12, 5))
ax1, ax2 = axes

ax1.set_title(' Data Before Scaling')
ax1.scatter(X_train['Age'], X_train['EstimatedSalary'], color='red')

ax2.set_title(' Data After Scaling')
ax2.scatter(X_train_scaled_df['Age'], X_train_scaled_df['EstimatedSalary'], color='green')

plt.show()

In [None]:
# KDE of both features before (left) and after (right) standardization.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

ax1.set_title('Data Before Scaling')
sns.kdeplot(X_train['Age'], ax=ax1)
sns.kdeplot(X_train['EstimatedSalary'], ax=ax1)

# The "after" title belongs on the right-hand axes; it was previously set on
# ax1, which overwrote the "before" title and left ax2 untitled.
ax2.set_title('Data After Scaling')
sns.kdeplot(X_train_scaled_df['Age'], ax=ax2)
sns.kdeplot(X_train_scaled_df['EstimatedSalary'], ax=ax2)

plt.show()

* Normalization  /  __Minmax Scaler__

In [None]:
# The wine file has no header row: read only its first three columns and name them here.
column = ['Class label', 'Alcohol', 'Malic acid']
wine_df = pd.read_csv('wine_data.csv', header=None, usecols=[0, 1, 2])
wine_df.columns = column
wine_df

In [None]:
from sklearn.model_selection import train_test_split

# Separate the features from the class label, then hold out 20% as a test set
# with a fixed seed.
wine_features = wine_df.drop(columns=["Class label"])
wine_target = wine_df["Class label"]
X_train, X_test, y_train, y_test = train_test_split(
    wine_features, wine_target, test_size=0.2, random_state=24
)


In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Learn the scaling parameters from the training set and scale it in one step ...
X_train_scaled = scaler.fit_transform(X_train)
# ... then apply those same parameters to the test set.
X_test_scaled = scaler.transform(X_test)

In [None]:
# Convert the scaled arrays back into DataFrames carrying the original column names.
X_train_scaled = pd.DataFrame(data=X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(data=X_test_scaled, columns=X_test.columns)

In [None]:
# Summary statistics of the unscaled training features.
X_train.describe()

In [None]:
# Summary statistics of the min-max scaled training features, for comparison with the unscaled ones.
X_train_scaled.describe()

In [None]:
# Alcohol against Malic acid, coloured by class label: raw (left) and min-max scaled (right).
fig, axes = plt.subplots(ncols=2, figsize=(12, 5))
ax1, ax2 = axes

ax1.set_title("Before Scaling")
ax1.scatter(X_train['Alcohol'], X_train['Malic acid'], c=y_train)

ax2.set_title("After Scaling")
ax2.scatter(X_train_scaled['Alcohol'], X_train_scaled['Malic acid'], c=y_train)

plt.show()

In [None]:
# KDE of both wine features: unscaled on the left axes, min-max scaled on the right.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

# left panel: original values
sns.kdeplot(X_train['Alcohol'], ax=ax1)
sns.kdeplot(X_train['Malic acid'], ax=ax1)
ax1.set_title('Before Scaling')

# right panel: scaled values
sns.kdeplot(X_train_scaled['Alcohol'], ax=ax2)
sns.kdeplot(X_train_scaled['Malic acid'], ax=ax2)
ax2.set_title('After  Scaling')

plt.show()

## Feature Transformation 
* Function Transformer
    1. Log Transform
    2. Square Transform
    3. Square Root Transform
    4. Reciprocal Transform
    5. Custom Transform


* Power Transformer

In [None]:
import scipy.stats as stats
from sklearn.compose  import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

In [None]:
# Load only the three columns needed for the transformation examples.
wanted_columns = ['Age', 'Fare', 'Survived']
data = pd.read_csv('train.csv', usecols=wanted_columns)
data.head()

In [None]:
# Fill missing ages with the mean age. Only 'Age' is targeted: a frame-wide
# fillna would also write the Age mean into any gaps in 'Fare' or 'Survived'.
data['Age'] = data['Age'].fillna(data['Age'].mean())

In [None]:
# Select the features and the target by column name rather than by position,
# so the split does not depend on the column order of the CSV file.
X = data[['Age', 'Fare']]
y = data['Survived']

In [None]:
# Hold out 20% of the rows as a test set; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:

# 2x2 grid, one feature per row: density curve on the left, normal Q-Q plot on
# the right. sns.distplot is deprecated; with hist=False it only drew the KDE
# curve, which sns.kdeplot draws directly.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 8))

for row, feature in enumerate(['Age', 'Fare']):
    sns.kdeplot(X_train[feature], color='r', ax=axes[row, 0])

    # probplot accepts an Axes object as its `plot` target.
    stats.probplot(X_train[feature], dist='norm', plot=axes[row, 1])
    axes[row, 1].set_title('QQ')

plt.show()

In [None]:
# Apply a square-root transform to 'Fare'; all other columns pass through unchanged.
sqrt_transformer = FunctionTransformer(func=np.sqrt)
clmn_trf = ColumnTransformer(
    transformers=[('fare', sqrt_transformer, ['Fare'])],
    remainder="passthrough",
)

In [None]:
# Fit on the training set, then reuse the fitted transformer on the test set.
X_train_clmn_transformed = clmn_trf.fit_transform(X=X_train)
X_test_clmn_transformed = clmn_trf.transform(X=X_test)

In [None]:
# The transformer outputs the transformed 'Fare' first, followed by the
# passed-through 'Age', hence this column order.
transformed_column_names = ['Fare', 'Age']
transformed_X_train_df = pd.DataFrame(X_train_clmn_transformed, columns=transformed_column_names)
transformed_X_train_df.sample(n=5)

In [None]:
# Normal Q-Q plots of Fare: original values on the left, square-root transformed on the right.
plt.figure(figsize=(14, 7))

plt.subplot(1, 2, 1)
stats.probplot(X_train['Fare'], dist="norm", plot=plt)
plt.title('Fare before transformed')

plt.subplot(1, 2, 2)
stats.probplot(transformed_X_train_df['Fare'], dist="norm", plot=plt)
plt.title('Fare after transformed')

In [None]:
# Distribution of Fare before (left) and after (right) the square-root transform.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# equivalent that draws a density-normalised histogram with a KDE overlay.
fig, (ax_before, ax_after) = plt.subplots(ncols=2, figsize=(14, 7))

sns.histplot(X_train['Fare'], kde=True, stat='density', ax=ax_before)
ax_before.set_title('Fare before transformed')

sns.histplot(transformed_X_train_df['Fare'], kde=True, stat='density', ax=ax_after)
ax_after.set_title('Fare after transformed')

plt.show()

#### Gratitude for the journey so far – thanks for being a constant companion on this road of learning. Together, we've gained not just knowledge but also a deep intuition for the wonders ahead.


## Don't forget to check the resources in the Presentation 