In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("data\modeldata.csv")

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,Area,X,Y,XM,YM,Perimeter,BX,BY,Width,Height,Result,Type
0,0,94.3,131.7,16.1,131.7,16.1,36.8,125.9,10.2,11.5,11.5,no,no
1,1,89.1,233.0,16.7,233.0,16.7,36.5,227.8,11.1,10.6,11.5,no,no
2,2,90.9,156.5,17.0,156.5,17.0,35.5,151.2,11.3,10.6,11.3,no,no
3,3,79.7,207.5,16.7,207.5,16.7,33.7,202.3,11.3,10.2,10.6,yes,small
4,4,86.7,80.9,17.6,80.9,17.6,35.2,75.5,11.9,10.8,10.8,no,no


In [6]:
df = df.drop(columns=['Unnamed: 0','Type'])

In [7]:
df.head()

Unnamed: 0,Area,X,Y,XM,YM,Perimeter,BX,BY,Width,Height,Result
0,94.3,131.7,16.1,131.7,16.1,36.8,125.9,10.2,11.5,11.5,no
1,89.1,233.0,16.7,233.0,16.7,36.5,227.8,11.1,10.6,11.5,no
2,90.9,156.5,17.0,156.5,17.0,35.5,151.2,11.3,10.6,11.3,no
3,79.7,207.5,16.7,207.5,16.7,33.7,202.3,11.3,10.2,10.6,yes
4,86.7,80.9,17.6,80.9,17.6,35.2,75.5,11.9,10.8,10.8,no


In [8]:
df['Result'].value_counts()

Result
yes    421
no      79
Name: count, dtype: int64

In [9]:
df['Result'].unique()

array(['no', 'yes'], dtype=object)

In [10]:
df['Result'] = df['Result'].replace('no','0')
df['Result'] = df['Result'].replace('yes','1')

In [11]:
df['Result'].unique()

array(['0', '1'], dtype=object)

In [12]:
df['Result'].value_counts()

Result
1    421
0     79
Name: count, dtype: int64

In [13]:
## Independent and dependent features
X = df.drop(columns="Result",axis=1)
Y = df[["Result"]]

In [14]:
## Define which column should be ordinal encoded and which should be scaled
## Spliting X data
numerical_col = X.select_dtypes(exclude="object").columns

In [15]:
print(numerical_col)

Index(['Area', 'X', 'Y', 'XM', 'YM', 'Perimeter', 'BX', 'BY', 'Width',
       'Height'],
      dtype='object')


In [16]:
## using this library to automate the handling of missing value it will fill all missing value with (mean,median and mode)
from sklearn.impute import SimpleImputer

## using this library for feature scaling 
from sklearn.preprocessing import StandardScaler

## using this library for converting catgorical features into numerical features
## whenever our catorical features have rank we use OrdinalEncoder, else we use OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

## using library for pipeline, we need to perform simpleImputer then pass the result to StandartScaler then pass the result to Encoder
## pipeline just combines the step
## pipline is just connecting
from sklearn.pipeline import Pipeline

## now the connection is done we need to group this 
## for that we use library 
from sklearn.compose import ColumnTransformer

In [17]:
# Creating pipeline

# Numerical pipeline

num_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)

# Now we have two pipelines lets combine it 
preprocessor = ColumnTransformer(
    [
    ('numPipeline',num_pipeline,numerical_col)
    ]
)

In [18]:
preprocessor

In [19]:
# Train test split
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.30,random_state=30)

In [20]:
print(X_train,y_train,X_test,y_test)

      Area      X      Y     XM     YM  Perimeter     BX     BY  Width  Height
12    87.6  206.4   40.6  206.4   40.6       35.4  201.2   35.1   10.6    10.6
136  108.1  155.5   91.2  155.5   91.2       39.7  149.9   84.9   11.9    12.1
368   74.7  128.0   88.0  128.0   88.0       34.1  123.2   83.2   11.1     9.7
367   76.6  114.6   88.3  114.6   88.3       34.2  109.7   83.2   10.8     9.9
406   70.9  131.6   10.6  131.6   10.6       35.5  126.6    5.8   10.0    10.0
..     ...    ...    ...    ...    ...        ...    ...    ...    ...     ...
140  109.2  271.0  106.4  271.0  106.4       39.5  265.2  100.9   12.6    11.2
301   86.8   12.1    7.5   12.1    7.5       39.3    6.7    2.2   12.2    10.0
429   76.2  131.2   54.2  131.2   54.2       40.1  126.2   49.3    9.8    12.0
421   76.2  195.4   47.8  195.4   47.8       43.2  190.3   42.9   10.1    11.3
293   76.2  118.6  129.2   84.4  129.2       33.5   84.4  148.3   10.4    10.5

[350 rows x 10 columns]     Result
12       0
136  

In [21]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error


In [23]:
models={
    "random_forest":RandomForestClassifier(oob_score=True),
    "logistic_regression":LogisticRegression(),
    "decision_tree":DecisionTreeClassifier()
}

In [24]:
from sklearn.metrics import accuracy_score
def evaluate_model(X_train,y_train,X_test,y_test,models):
    report={}
    for i in range(len(models)):
        model=list(models.values())[i]
        model.fit(X_train,y_train)
        print(model.score(X_train,y_train))
        #print("OOB score:", model.oob_score_)
        
        y_pred=model.predict(X_test)
        
        accuracy=round(accuracy_score(y_test,y_pred)*100,2)
        
        report[list(models.keys())[i]]=accuracy
    
    return report

In [25]:
evaluate_model(X_train,y_train.values.ravel(),X_test,y_test,models)  ## using values ravel beacause it was giving warning for 1d array

1.0
0.8342857142857143
1.0


{'random_forest': 82.67, 'logistic_regression': 84.67, 'decision_tree': 74.0}

In [26]:
params={
    
    "n_estimators":[50,100,200],
    "criterion":["gini","entropy"],
    "max_depth":[3,5,10]
    
}

In [27]:
model=RandomForestClassifier(oob_score=True)

In [28]:
from sklearn.model_selection import RandomizedSearchCV

In [29]:
cv=RandomizedSearchCV(model,param_distributions=params,scoring='accuracy',cv=5,verbose=3)

In [30]:
cv.fit(X_train,y_train.values.ravel())

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=gini, max_depth=3, n_estimators=50;, score=0.829 total time=   0.0s


[CV 2/5] END criterion=gini, max_depth=3, n_estimators=50;, score=0.843 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=3, n_estimators=50;, score=0.829 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=3, n_estimators=50;, score=0.829 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=3, n_estimators=50;, score=0.857 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=3, n_estimators=100;, score=0.843 total time=   0.1s
[CV 2/5] END criterion=gini, max_depth=3, n_estimators=100;, score=0.843 total time=   0.1s
[CV 3/5] END criterion=gini, max_depth=3, n_estimators=100;, score=0.829 total time=   0.1s
[CV 4/5] END criterion=gini, max_depth=3, n_estimators=100;, score=0.843 total time=   0.1s
[CV 5/5] END criterion=gini, max_depth=3, n_estimators=100;, score=0.843 total time=   0.1s
[CV 1/5] END criterion=gini, max_depth=5, n_estimators=50;, score=0.814 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=5, n_estimators=50;, score=0.886 total time=  

In [31]:
cv.best_params_

{'n_estimators': 200, 'max_depth': 3, 'criterion': 'gini'}

In [32]:
bestmodel=RandomForestClassifier(n_estimators=50,max_depth=3,criterion='gini')

In [33]:
bestmodel.fit(X_train,y_train)

  return fit_method(estimator, *args, **kwargs)


In [34]:
y_pred=bestmodel.predict(X_test)

In [35]:
best_accuracy=round(accuracy_score(y_test,y_pred)*100,2)

In [36]:
best_accuracy

86.0