# Main Notebook
### Include:
 - <b>datasets</b> folder - folder that holds the files of the different datasets
### Imports:
 - <b>paths</b> - object that contain paths to datasets in datasets folder
 - <b>DataPreprocessor</b> - class for cleaning data, fill missing values, encoding, detect outliers with IQR, split to train/test and more... 
 - <b>Utils</b> - class for upload csv and download from kaggle function
 - <b>ClassifierModel</b> - class for train, predict, evaluate and more...

In [1]:
from paths import datasets
from data_preprocessor import DataPreprocessor
from utils import Utils
from classifier_model import ClassifierModel

In [2]:
df = Utils.load_csv(datasets["titanic"])
dp = DataPreprocessor(df)
dp.explore_data()

First 5 rows:
    PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   Na

In [3]:
dp.fill_missing()

apply median to column PassengerId
apply median to column Survived
apply median to column Pclass
apply mode to column Name
apply mode to column Sex
apply median to column Age
apply median to column SibSp
apply median to column Parch
apply mode to column Ticket
apply median to column Fare
apply mode to column Cabin
apply mode to column Embarked
fill_missing terminated successfully


In [4]:
dp.detect_outliers_iqr()

detect_outliers_iqr terminated successfully


In [5]:
dp.encode_categorical()

encode_categorical terminated successfully


In [6]:
dp.scale_features()

scale_features terminated successfully


In [7]:
clean_df = dp.get_clean_data()
clean_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,PassengerId_outlier,Survived_outlier,Pclass_outlier,...,Cabin_F G63,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_Q,Embarked_S
0,-1.730108,-0.789272,0.827377,-0.565736,0.432793,-0.473674,-0.502445,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,True
1,-1.72622,1.26699,-1.566107,0.663861,0.432793,-0.473674,0.786845,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,-1.722332,1.26699,0.827377,-0.258337,-0.474545,-0.473674,-0.488854,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,True
3,-1.718444,1.26699,-1.566107,0.433312,0.432793,-0.473674,0.42073,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,True
4,-1.714556,-0.789272,0.827377,0.433312,-0.474545,-0.473674,-0.486337,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,True


In [8]:
X_train, X_test, y_train, y_test = dp.split_to_train_test(target_column="Survived")
X_train.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,PassengerId_outlier,Survived_outlier,Pclass_outlier,Age_outlier,...,Cabin_F G63,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_Q,Embarked_S
331,-0.443219,-1.566107,1.240235,-0.474545,-0.473674,-0.074583,0.0,0.0,0.0,-0.282843,...,False,False,False,False,False,False,False,False,False,True
733,1.11971,-0.369365,-0.488887,-0.474545,-0.473674,-0.386671,0.0,0.0,0.0,-0.282843,...,False,False,False,False,False,False,False,False,False,True
382,-0.244937,0.827377,0.202762,-0.474545,-0.473674,-0.488854,0.0,0.0,0.0,-0.282843,...,False,False,False,False,False,False,False,False,False,True
704,1.006962,0.827377,-0.258337,0.432793,-0.473674,-0.49028,0.0,0.0,0.0,-0.282843,...,False,False,False,False,False,False,False,False,False,True
813,1.430741,0.827377,-1.795334,3.154809,2.008933,-0.018709,0.0,0.0,0.0,-0.282843,...,False,False,False,False,False,False,False,False,False,True


In [9]:
clf = ClassifierModel()

In [10]:
y_train = y_train.astype(int)
clf.train(X_train, y_train)

train terminated seccussfully


In [11]:
y_test = y_test.astype(int)
clf.evaluate(X_test, y_test)

              precision    recall  f1-score   support

           0       0.79      0.92      0.85       105
           1       0.86      0.65      0.74        74

    accuracy                           0.81       179
   macro avg       0.82      0.79      0.79       179
weighted avg       0.82      0.81      0.80       179



{'0': {'precision': 0.7886178861788617,
  'recall': 0.9238095238095239,
  'f1-score': 0.8508771929824561,
  'support': 105.0},
 '1': {'precision': 0.8571428571428571,
  'recall': 0.6486486486486487,
  'f1-score': 0.7384615384615385,
  'support': 74.0},
 'accuracy': 0.8100558659217877,
 'macro avg': {'precision': 0.8228803716608595,
  'recall': 0.7862290862290863,
  'f1-score': 0.7946693657219973,
  'support': 179.0},
 'weighted avg': {'precision': 0.816946645124871,
  'recall': 0.8100558659217877,
  'f1-score': 0.804403682174926,
  'support': 179.0}}

In [15]:
probs = clf.predict_proba(X_test)
probs

array([[0.81, 0.19],
       [0.93, 0.07],
       [0.96, 0.04],
       [0.12, 0.88],
       [0.51, 0.49],
       [0.2 , 0.8 ],
       [0.4 , 0.6 ],
       [0.89, 0.11],
       [0.47, 0.53],
       [0.15, 0.85],
       [0.6 , 0.4 ],
       [0.97, 0.03],
       [0.9 , 0.1 ],
       [0.97, 0.03],
       [0.92, 0.08],
       [0.11, 0.89],
       [0.59, 0.41],
       [0.57, 0.43],
       [0.95, 0.05],
       [0.8 , 0.2 ],
       [0.98, 0.02],
       [0.85, 0.15],
       [0.71, 0.29],
       [0.98, 0.02],
       [0.99, 0.01],
       [0.92, 0.08],
       [0.78, 0.22],
       [0.94, 0.06],
       [0.83, 0.17],
       [0.63, 0.37],
       [0.94, 0.06],
       [0.67, 0.33],
       [0.85, 0.15],
       [0.61, 0.39],
       [0.97, 0.03],
       [0.9 , 0.1 ],
       [0.59, 0.41],
       [0.45, 0.55],
       [0.19, 0.81],
       [1.  , 0.  ],
       [0.92, 0.08],
       [0.97, 0.03],
       [0.97, 0.03],
       [0.96, 0.04],
       [0.52, 0.48],
       [0.92, 0.08],
       [0.93, 0.07],
       [0.97,

In [13]:
clf.cross_validate(X_train, y_train)

Cross-validation scores: [0.7972028  0.84615385 0.82394366 0.79577465 0.82394366]
Mean accuracy: 0.8174


array([0.7972028 , 0.84615385, 0.82394366, 0.79577465, 0.82394366])

In [14]:
clf.save_model("random_forest_model.pkl")

✅ Model saved to random_forest_model.pkl
