In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [2]:
df = pd.read_excel("IndiaWeather.xlsx")
df

Unnamed: 0,อุณหภูมิ,ความชื้น,ปริมาณ PM2.5,ปริมาณ PM10,ปริมาณไนโตรเจน,ปริมาณซัลเฟอร์,ปริมาณคาร์บอน,ระยะห่างจากโรงงาน,ความหนาแน่นประชากร,คุณภาพอากาศ
0,24.7,53.8,2.1,8.7,25.1,21.8,0.88,10.0,310,ปานกลาง
1,25.8,65.6,12.7,18.5,12.3,26,1.02,0.0,297,ดี
2,26.6,55.2,26.6,39.1,?,25.8,0.54,0.6,316,ปานกลาง
3,24.3,63,2.5,13.8,15.9,3.7,1.3,6.6,270,แย่
4,23.3,73.2,19.9,37.2,17.1,19.6,1.15,1.7,319,ดี
...,...,...,...,...,...,...,...,...,...,...
495,27.3,59.5,65.7,73.5,18.5,9.6,0.51,0.2,290,ปานกลาง
496,22.5,58.6,46.4,57.8,10.7,27.6,1.13,4.1,293,ปานกลาง
497,24.4,?,31.5,40.2,12.5,2,0.66,0.3,264,ปานกลาง
498,19.2,50.7,56.8,65.6,14.1,13,0.45,6.4,322,ปานกลาง


In [3]:
df = df[~df.isin(['?']).any(axis=1)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 456 entries, 0 to 498
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   อุณหภูมิ            456 non-null    object 
 1   ความชื้น            456 non-null    object 
 2   ปริมาณ PM2.5        456 non-null    object 
 3   ปริมาณ PM10         456 non-null    object 
 4   ปริมาณไนโตรเจน      456 non-null    object 
 5   ปริมาณซัลเฟอร์      456 non-null    object 
 6   ปริมาณคาร์บอน       456 non-null    object 
 7   ระยะห่างจากโรงงาน   456 non-null    float64
 8   ความหนาแน่นประชากร  456 non-null    int64  
 9   คุณภาพอากาศ         456 non-null    object 
dtypes: float64(1), int64(1), object(8)
memory usage: 39.2+ KB


# Know the data to mapping

In [4]:
df['คุณภาพอากาศ'].unique()

array(['ปานกลาง', 'ดี', 'แย่', 'อันตรายต่อสุขภาพ'], dtype=object)

# Ordinal Target Mapping

In [5]:
map1 = {
    'อันตรายต่อสุขภาพ': 0,
    'แย่': 1,
    'ปานกลาง': 2,
    'ดี': 3
}
df = df.copy()

df['คุณภาพอากาศ'] = df['คุณภาพอากาศ'].map(map1)

# Sampling

In [6]:
X = df.drop('คุณภาพอากาศ', axis=1)
y = df['คุณภาพอากาศ']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Attempt 1: DecisionTreeClassifier + GridSearchCV

In [7]:
from sklearn.tree import DecisionTreeClassifier

In [8]:
params = {
    'max_depth': [3, 5, 7, 9, 11],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5]
}

grid = GridSearchCV(DecisionTreeClassifier(random_state=42),
                    param_grid=params, 
                    cv=5,
                   verbose=2)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=10; total time=   0.0s
[CV] END max_depth=3, min_samples_l

In [9]:
grid.score(X_test, y_test)

0.2391304347826087

# Attempt 2: RandomForestClassifier + GridSearchCV

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [11]:
parameters_grid = {
    'n_estimators': [50, 100, 150],        
    'max_depth': [3, 5, 7],              
    'min_samples_split': [2, 5, 10],     
    'min_samples_leaf': [1, 2, 5],    
    'max_features': ['sqrt', 'log2'], 
}

grid = GridSearchCV(RandomForestClassifier(random_state=42), 
                    param_grid=params, 
                    cv=5,
                   verbose=2)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)       

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=10; total time=   0.0s
[CV] END max_depth=3, min_samples_l

In [12]:
grid.score(X_test, y_test)

0.43478260869565216

# Attemp 3: GradientBoostingClassifier + GridSearchCV

In [13]:
from sklearn.ensemble import GradientBoostingClassifier

In [14]:
parameters_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.05, 0.1],
    'max_depth': [2, 3],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [1, 2],
    'subsample': [0.8]
}

grid = GridSearchCV(GradientBoostingClassifier(random_state=42), 
                    param_grid=parameters_grid, 
                    cv=5, 
                    verbose=2)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END learning_rate=0.05, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END learning_rate=0.05, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END learning_rate=0.05, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END learning_rate=0.05, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END learning_rate=0.05, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END learning_rate=0.05, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=100, subsample=0.8; total time=   0.2s
[CV] END learning_rate=0.05, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=100, subsample=0.8; total time=   0.2

In [15]:
grid.score(X_test, y_test)

0.358695652173913

# NAH