![](https://raw.githubusercontent.com/IqmanS/Machine-Learning-Notebooks/main/digit_recognizer/banner.png)
# 🔍 **Digit Recognizer Baseline**

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# Plotting setup for the notebook.
# Register pandas' converters with matplotlib before pyplot is imported.
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
# IPython magic: draw matplotlib figures inline in the cell output.
%matplotlib inline
import seaborn as sns
sns.set_style("dark") # Use seaborn's "dark" style for all plots
# Marker printed once the setup statements above have run.
print("Setup Complete")

/kaggle/input/digit-recognizer/sample_submission.csv
/kaggle/input/digit-recognizer/train.csv
/kaggle/input/digit-recognizer/test.csv
Setup Complete


In [2]:
# Relative locations of the competition's training and test CSV files.
train_path = "../input/digit-recognizer/train.csv"
test_path = "../input/digit-recognizer/test.csv"

# Read both CSVs into DataFrames (training file first, then test file).
train_data, test_data = map(pd.read_csv, (train_path, test_path))

In [3]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

In [4]:
# Feature columns: every column of the training frame except the target "label".
cols = [c for c in train_data.columns if c != "label"]

# np.random.seed() returns None, so its result must not be used as the seed value.
# Keep the integer in `seed` (so every `random_state=seed` below is actually fixed)
# and seed NumPy's global generator separately.
seed = 0
np.random.seed(seed)

# Hold out 30% of the labelled rows for evaluating the models.
X_train, X_test, y_train, y_test = train_test_split(
    train_data[cols],
    train_data["label"],
    test_size=0.3,
    random_state=seed,
)

In [5]:
# Data the models are fitted on: the training portion of the hold-out split.
# To fit on every labelled row instead, use train_data[cols] and train_data["label"].
train_x, train_y = X_train, y_train

In [6]:
# Random forest: fit on the training portion, then report accuracy (in %) on the held-out rows.
rfmodel = RandomForestClassifier(random_state=seed)
rfmodel.fit(train_x, train_y)
rf_holdout_pred = rfmodel.predict(X_test)
rf_accuracy_pct = accuracy_score(y_test, rf_holdout_pred) * 100
print("Accuracy of RF:", rf_accuracy_pct)

Accuracy of RF: 96.2063492063492


In [7]:
# Gradient boosting: fit on the training portion, then report accuracy (in %) on the held-out rows.
gbmodel = GradientBoostingClassifier(random_state=seed)
gbmodel.fit(train_x, train_y)
gb_holdout_pred = gbmodel.predict(X_test)
gb_accuracy_pct = accuracy_score(y_test, gb_holdout_pred) * 100
print("Accuracy of GB:", gb_accuracy_pct)

Accuracy of GB: 94.13492063492063


In [8]:
# Histogram-based gradient boosting: fit on the training portion, then report accuracy (in %) on the held-out rows.
hgbmodel = HistGradientBoostingClassifier(random_state=seed)
hgbmodel.fit(train_x, train_y)
hgb_holdout_pred = hgbmodel.predict(X_test)
hgb_accuracy_pct = accuracy_score(y_test, hgb_holdout_pred) * 100
print("Accuracy of HGB:", hgb_accuracy_pct)

Accuracy of HGB: 97.12698412698413


In [9]:
# XGBoost: fit on the training portion, then report accuracy (in %) on the held-out rows.
xgbmodel = XGBClassifier(random_state=seed)
xgbmodel.fit(train_x, train_y)
xgb_holdout_pred = xgbmodel.predict(X_test)
xgb_accuracy_pct = accuracy_score(y_test, xgb_holdout_pred) * 100
print("Accuracy of XGB:", xgb_accuracy_pct)

Accuracy of XGB: 97.16666666666667


In [10]:
# LightGBM: fit on the training portion, then report accuracy (in %) on the held-out rows.
lgbmmodel = LGBMClassifier(random_state=seed)
lgbmmodel.fit(train_x, train_y)
lgbm_holdout_pred = lgbmmodel.predict(X_test)
lgbm_accuracy_pct = accuracy_score(y_test, lgbm_holdout_pred) * 100
print("Accuracy of LGBM:", lgbm_accuracy_pct)

Accuracy of LGBM: 97.12698412698413


In [11]:
# CatBoost (training log silenced): fit on the training portion, then report accuracy (in %) on the held-out rows.
catmodel = CatBoostClassifier(random_state=seed, verbose=False)
catmodel.fit(train_x, train_y)
cat_holdout_pred = catmodel.predict(X_test)
cat_accuracy_pct = accuracy_score(y_test, cat_holdout_pred) * 100
print("Accuracy of CAT:", cat_accuracy_pct)

Accuracy of CAT: 96.61904761904762


In [18]:
# Voting ensemble over four of the classifiers defined above,
# fitted and scored in the same way as the individual models.
voting_members = [
    ("rf", rfmodel),
    ("hgb", hgbmodel),
    ("gb", gbmodel),
    ("cat", catmodel),
]
vcmodel = VotingClassifier(voting_members)
vcmodel.fit(train_x, train_y)
vc_holdout_pred = vcmodel.predict(X_test)
vc_accuracy_pct = accuracy_score(y_test, vc_holdout_pred) * 100
print("Accuracy of VC:", vc_accuracy_pct)

In [13]:
# Short name -> fitted classifier, in the order the models were built above.
models = dict(
    rf=rfmodel,
    gb=gbmodel,
    hgb=hgbmodel,
    xgb=xgbmodel,
    lgbm=lgbmmodel,
    cat=catmodel,
    vc=vcmodel,
)

In [19]:
# Write one submission CSV per model: submission_<model key>.csv with columns
# "ImageId" and "Label".

# ImageId is the test row's index shifted by one; it does not depend on the
# model, so compute it once instead of on every iteration.
image_ids = test_data.index + 1

for model_name, model in models.items():
    # Flatten to 1-D: an estimator may hand back predictions as an (n, 1)
    # column rather than a flat array (NOTE(review): confirm which ones do).
    prediction = np.asarray(model.predict(test_data)).ravel()

    # Build the two-column frame directly rather than copying the whole test
    # frame and dropping every feature column in place.
    submissions = pd.DataFrame({"ImageId": image_ids, "Label": prediction})

    name = f"submission_{model_name}.csv"
    submissions.to_csv(name, header=True, index=False)