In [1]:
import numpy as np
import pandas as pd
from supplemental_english import GOVERNMENT_CODES
from supplemental_english import REGION_CODES

In [39]:
import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [4]:
df = pd.read_csv('data/train.csv')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51635 entries, 0 to 51634
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      51635 non-null  int64 
 1   plate   51635 non-null  object
 2   date    51635 non-null  object
 3   price   51635 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 1.6+ MB


In [6]:
df.head()

Unnamed: 0,id,plate,date,price
0,1,X059CP797,2024-12-26 00:00:00,65000
1,2,Y800MH790,2024-07-12 21:31:37,100000
2,3,A212TX77,2024-04-18 00:00:00,290000
3,4,P001AY199,2025-01-03 00:27:15,680000
4,5,P001AY199,2025-01-10 09:32:41,750000


In [7]:
def find_importance_values_for_plate(plate: str, gov_codes: dict) -> tuple:
    letters = plate[0] + plate[4:6]  # Extracts letters
    numbers = int(plate[1:4])  # Extracts numbers
    region_code = plate[6:]  # Extracts region code

    print(plate, "---", letters, numbers, region_code)

    for (code_letters, num_range, region), details in gov_codes.items():
        if letters == code_letters and region_code == region:
            if num_range[0] <= numbers <= num_range[1]:  # Checks if within range
                return (details[1],details[2], details[3])  # Importance values

    return (0, 0, 0)  # Ordinary plate, no government affiliation

In [8]:
forbidden = []
advantage = []
significance = []
for plate in df['plate']:
    a,b,c = (find_importance_values_for_plate(plate, GOVERNMENT_CODES))
    forbidden.append(a)
    advantage.append(b)
    significance.append(c)

X059CP797 --- XCP 59 797
Y800MH790 --- YMH 800 790
A212TX77 --- ATX 212 77
P001AY199 --- PAY 1 199
P001AY199 --- PAY 1 199
B400BB750 --- BBB 400 750
P666OM790 --- POM 666 790
C080KP777 --- CKP 80 777
E737AY550 --- EAY 737 550
C001KK750 --- CKK 1 750
P227YC77 --- PYC 227 77
A111HA777 --- AHA 111 777
P745CX797 --- PCX 745 797
Y100CP777 --- YCP 100 777
T047EM797 --- TEM 47 797
P991CC05 --- PCC 991 05
P141BY77 --- PBY 141 77
X140YM38 --- XYM 140 38
C400OP797 --- COP 400 797
Y999HM34 --- YHM 999 34
K063KK190 --- KKK 63 190
K063KK190 --- KKK 63 190
B764AA250 --- BAA 764 250
K093OE797 --- KOE 93 797
K555PA196 --- KPA 555 196
H003MB97 --- HMB 3 97
B878HA193 --- BHA 878 193
T888AX198 --- TAX 888 198
T089CY797 --- TCY 89 797
A077OE777 --- AOE 77 777
O400CT750 --- OCT 400 750
H300KP75 --- HKP 300 75
M313MM99 --- MMM 313 99
M313MM99 --- MMM 313 99
E001MY197 --- EMY 1 197
X971XX40 --- XXX 971 40
M281AX977 --- MAX 281 977
K015OO797 --- KOO 15 797
K015OO797 --- KOO 15 797
H424BE790 --- HBE 424 790
T6

In [9]:
df['forbidden'] = forbidden
df['advantage'] = advantage
df['significance'] = significance

In [10]:
df.head()

Unnamed: 0,id,plate,date,price,forbidden,advantage,significance
0,1,X059CP797,2024-12-26 00:00:00,65000,0,0,0
1,2,Y800MH790,2024-07-12 21:31:37,100000,0,0,0
2,3,A212TX77,2024-04-18 00:00:00,290000,0,0,0
3,4,P001AY199,2025-01-03 00:27:15,680000,0,0,0
4,5,P001AY199,2025-01-10 09:32:41,750000,0,0,0


In [11]:
df['forbidden'].value_counts()

forbidden
0    51632
1        3
Name: count, dtype: int64

In [12]:
df['advantage'].value_counts()

advantage
0    51037
1      598
Name: count, dtype: int64

In [13]:
df['significance'].value_counts()

significance
0    50973
3      182
2      154
5      143
4       54
6       54
1       50
8       25
Name: count, dtype: int64

In [14]:

def find_city_for_plate(plate: str, gov_codes: dict) -> tuple:
    letters = plate[0] + plate[4:6]  # Extracts letters
    numbers = int(plate[1:4])  # Extracts numbers
    region_code = plate[6:]  # Extracts region code

    print(plate, "---", letters, numbers, region_code)

    for city, num_arr in REGION_CODES.items():
        if region_code in num_arr:
            return city 

In [15]:
city = []
for plate in df['plate']:
    a = (find_city_for_plate(plate, GOVERNMENT_CODES))
    city.append(a)

X059CP797 --- XCP 59 797
Y800MH790 --- YMH 800 790
A212TX77 --- ATX 212 77
P001AY199 --- PAY 1 199
P001AY199 --- PAY 1 199
B400BB750 --- BBB 400 750
P666OM790 --- POM 666 790
C080KP777 --- CKP 80 777
E737AY550 --- EAY 737 550
C001KK750 --- CKK 1 750
P227YC77 --- PYC 227 77
A111HA777 --- AHA 111 777
P745CX797 --- PCX 745 797
Y100CP777 --- YCP 100 777
T047EM797 --- TEM 47 797
P991CC05 --- PCC 991 05
P141BY77 --- PBY 141 77
X140YM38 --- XYM 140 38
C400OP797 --- COP 400 797
Y999HM34 --- YHM 999 34
K063KK190 --- KKK 63 190
K063KK190 --- KKK 63 190
B764AA250 --- BAA 764 250
K093OE797 --- KOE 93 797
K555PA196 --- KPA 555 196
H003MB97 --- HMB 3 97
B878HA193 --- BHA 878 193
T888AX198 --- TAX 888 198
T089CY797 --- TCY 89 797
A077OE777 --- AOE 77 777
O400CT750 --- OCT 400 750
H300KP75 --- HKP 300 75
M313MM99 --- MMM 313 99
M313MM99 --- MMM 313 99
E001MY197 --- EMY 1 197
X971XX40 --- XXX 971 40
M281AX977 --- MAX 281 977
K015OO797 --- KOO 15 797
K015OO797 --- KOO 15 797
H424BE790 --- HBE 424 790
T6

In [16]:
df['city']=city

In [17]:
le = LabelEncoder()
for i in df.columns:
    df[i] = le.fit_transform(df[i])

In [18]:
df.head()

Unnamed: 0,id,plate,date,price,forbidden,advantage,significance,city
0,0,37978,8424,79,0,0,0,35
1,1,43166,6452,129,0,0,0,36
2,2,1827,5722,267,0,0,0,35
3,3,30833,8492,407,0,0,0,35
4,4,30833,8570,435,0,0,0,35


In [19]:
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['price']),df['price'],test_size = 0.2, random_state=2)

In [20]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41308 entries, 23478 to 23720
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   id            41308 non-null  int64
 1   plate         41308 non-null  int64
 2   date          41308 non-null  int64
 3   forbidden     41308 non-null  int64
 4   advantage     41308 non-null  int64
 5   significance  41308 non-null  int64
 6   city          41308 non-null  int64
dtypes: int64(7)
memory usage: 2.5 MB


In [21]:
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)
xgb = XGBClassifier(n_estimators=50,random_state=2)

In [37]:
clfs = {
    'SVC' : svc,#
    'KN' : knc, 
    'NB': mnb, 
    'DT': dtc, 
    'LR': lrc, #
    'RF': rfc, #
    'AdaBoost': abc, 
    'BgC': bc, #
    'ETC': etc,
    'GBDT':gbdt,
    # 'xgb':xgb
}

In [27]:
def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))


In [28]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    smape_score = smape(y_test, y_pred)
    
    return smape_score

In [38]:
for name,clf in clfs.items():
    
    smape_score = train_classifier(clf, X_train,y_train,X_test,y_test)
    
    print("For ",name)
    print("smape - ",smape_score)

For  SVC
smape -  68.69205528983184
For  KN
smape -  90.70741623527584
For  NB
smape -  91.82123579810577
For  DT
smape -  74.65662212660982
For  LR
smape -  74.51102926142072
For  RF
smape -  65.07827720217973
For  AdaBoost
smape -  71.66445526702041
For  BgC
smape -  65.67124664063161
For  ETC
smape -  65.6420436134079
For  GBDT
smape -  87.13378628313542


In [43]:
RandomForestClassifier.fit(X_train,y_train)
test_pred = RandomForestClassifier(n_estimators=50, random_state=2).predict(test_df)

AttributeError: 'DataFrame' object has no attribute '_validate_params'

In [40]:
# Initialising the ANN
classifier = Sequential()

# Adding the input layer and the first hidden layer
classifier.add(Dense(units=50, kernel_initializer='he_uniform', activation='relu', input_dim=12))

# Adding the second hidden layer
classifier.add(Dense(units=25, kernel_initializer='he_uniform', activation='relu'))

# Adding the third hidden layer
classifier.add(Dense(units=50, kernel_initializer='he_uniform', activation='relu'))

# Adding the output layer
classifier.add(Dense(units=1, kernel_initializer='he_uniform', activation='sigmoid'))

# Compiling the ANN
classifier.compile(loss='binary_crossentropy', optimizer='Adamax', metrics=['accuracy'])

# Fitting the ANN to the Training set
model_history=classifier.fit(X_train.values, Y_train.values,validation_split=0.20, batch_size = 10, epochs = 2500)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


NameError: name 'Y_train' is not defined