# Tasks   
    Task 1: Read given data into DataFrame in python “Cat_Human.csv”. Perform Data cleaning.
    Task 2: After data cleaning, you are required to prepare your dataset for training.
    • Separate features and labels.
    • Feature scaling/Normalization
    • Perform Label Encoding
    • Split dataset into training and testing data
    Task 3: Display confusion matrix and generate report of f1-score, recall and precision.

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder,OneHotEncoder ,MinMaxScaler ,StandardScaler
from sklearn.compose import ColumnTransformer 
from sklearn.metrics import accuracy_score ,f1_score
import joblib

## Task 1:
    Read given data into DataFrame in python “Cat_Human.csv”. Perform Data cleaning.

In [2]:
## Import csv file using pandas
df= pd.read_csv("../DataSets/Cat_human.csv")
df

Unnamed: 0,Color,Eye_color,Height,Legs,Moustache,Tail,Weight,label
0,No,black,5.14,2,No,No,70.000000,human
1,No,brown,6.80,2,No,No,64.400000,human
2,Yes,brown,5.00,2,Yes,No,64.800000,human
3,No,blue,5.90,2,No,No,78.800000,human
4,No,blue,6.56,2,No,No,73.200000,human
...,...,...,...,...,...,...,...,...
195,brown,gray,1.14,4,Yes,Yes,2.304511,Cat
196,white,yellow,1.39,4,Yes,Yes,5.687970,Cat
197,white,black,0.53,4,Yes,Yes,6.364662,Cat
198,brown,green,1.03,4,Yes,Yes,6.590226,Cat


In [3]:
## Check for missing values or nan values to see if Data Cleaning is necessary

df.isna().sum()

Color        0
Eye_color    0
Height       0
Legs         0
Moustache    0
Tail         0
Weight       0
label        0
dtype: int64

In [4]:
## check for duplicate rows 
df.duplicated().sum()

0

In [5]:
## check for duplicate rows 
df.T.duplicated().sum()

0

In [6]:
## Well there are no duplicates but we can do this just for convention and repeatition

df.drop_duplicates(inplace= False)  ## will drop duplicate in row
df.T.drop_duplicates(inplace= False).T  ## will drop duplicates in Column

Unnamed: 0,Color,Eye_color,Height,Legs,Moustache,Tail,Weight,label
0,No,black,5.14,2,No,No,70.0,human
1,No,brown,6.8,2,No,No,64.4,human
2,Yes,brown,5.0,2,Yes,No,64.8,human
3,No,blue,5.9,2,No,No,78.8,human
4,No,blue,6.56,2,No,No,73.2,human
...,...,...,...,...,...,...,...,...
195,brown,gray,1.14,4,Yes,Yes,2.304511,Cat
196,white,yellow,1.39,4,Yes,Yes,5.68797,Cat
197,white,black,0.53,4,Yes,Yes,6.364662,Cat
198,brown,green,1.03,4,Yes,Yes,6.590226,Cat


## Task 2: 
         After data cleaning, you are required to prepare your dataset for training.

In [8]:
# before splitting the apply label encoder or OneHotEncoder

x = df.drop("label" ,axis = 1)
y = df["label"]

label_encoder = LabelEncoder( )

y_encoded=label_encoder.fit_transform(y)

numeric_transformer = Pipeline(steps = [("scaler",MinMaxScaler())])
category_transformer = Pipeline(steps = [ ("encoder",OneHotEncoder()) ])


category = ["Color","Eye_color","Moustache","Tail"]
numerical = ["Height","Weight"]

preprocessor  = ColumnTransformer([("category",category_transformer,category),
                                 ("numerical",numeric_transformer,numerical)],remainder='passthrough')

pipeline = Pipeline([("Preprocessor",preprocessor),
                     ("classifier",LogisticRegression())])

x_train ,x_test ,y_train ,y_test = train_test_split(x ,y_encoded ,test_size=0.2 ,random_state = 42)

pipeline.fit(x_train , y_train)


In [9]:
## model score
pipeline.score(x_test ,y_test)
y_preds = pipeline.predict(x_test)
y_preds

array([1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1])

In [10]:
## accuracy Score
accuracy_score(y_test , y_preds)

1.0

In [11]:
##
f1_score(y_test , y_preds)

1.0

In [12]:
## model deployment

joblib.dump(pipeline , "../TrainedModels/modelPipeline20june.pkl")
joblib.dump(label_encoder , "../TrainedModels/LabelEncoder20june.pkl")



['../TrainedModels/LabelEncoder20june.pkl']

In [13]:
## import model file

pipeline = joblib.load("../TrainedModels/modelPipeline20june.pkl")
label_encoder = joblib.load("../TrainedModels/LabelEncoder20june.pkl")


In [14]:
# Set handle_unknown='ignore' for the label encoder
label_encoder.handle_unknown = 'ignore'
print(label_encoder.classes_)


# Create a dictionary to map the label encoder classes to their original labels
class_map  = {i: label for i,label in enumerate(label_encoder.classes_)}
class_map

['Cat' 'human']


{0: 'Cat', 1: 'human'}

In [15]:
# now model will predict based on input data provided by user


color = input("Enter the color: ")
eye_color = input("Enter the eye color: ")
height = float(input("Enter the height: "))
legs = int(input("Enter the number of legs: "))
moustache = input("Does it have a moustache (Yes/No): ")
tail = input("Does it have a tail (Yes/No): ")
weight = float(input("Enter the weight: "))
# Create a DataFrame with the user input
data = pd.DataFrame({
'Color': [color],
'Eye_color': [eye_color],
'Height': [height],
'Legs': [legs],
'Moustache': [moustache],
'Tail': [tail],
'Weight': [weight]
})

# predict the data 
transform_data = pipeline["Preprocessor"].transform(data)

predict = pipeline["classifier"].predict(transform_data)
print("The Label is :" , class_map[predict[0]])



Enter the color: brown
Enter the eye color: gray
Enter the height: 2
Enter the number of legs: 4
Does it have a moustache (Yes/No): No
Does it have a tail (Yes/No): Yes
Enter the weight: 3
The Label is : Cat
