In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Allow pandas to render up to 500 columns when displaying a frame.
pd.set_option('display.max_columns', 500)
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [3]:
# Load the hackathon weather CSV into a single DataFrame and display it.
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
weather_csv = '/kaggle/input/formulaaihackathon2022/weather.csv'
data = pd.read_csv(weather_csv, low_memory=False)
data

In [4]:
# Display the column labels of the loaded frame.
data.columns

In [5]:
# Print a per-column summary of the frame (dtypes and non-null counts).
data.info()

In [6]:
# Number of distinct values in each column.
data.nunique()

In [7]:
# Collect the labels of columns that hold exactly one distinct value.
unique_counts = data.nunique()
is_single_valued = unique_counts == 1
constants = unique_counts[is_single_valued].keys()
constants

In [8]:
# Columns judged irrelevant for the analysis; a later cell drops them from `data`.
drop_col = [
    'M_SECONDARY_PLAYER_CAR_INDEX',
    'TIMESTAMP',
    'M_FORECAST_ACCURACY',
    'Unnamed: 58',
    'M_IS_SPECTATING',
    'GAMEHOST',
    'M_SPECTATOR_CAR_INDEX',
    'M_SLI_PRO_NATIVE_SUPPORT',
    'M_SAFETY_CAR_STATUS',
    'M_NETWORK_GAME',
]

In [9]:
# Remove the hand-picked irrelevant columns together with every single-valued column.
columns_to_remove = [*drop_col, *constants]
data = data.drop(columns=columns_to_remove)


In [10]:
# Summary statistics for the columns that remain after the drops above.
data.describe()

In [11]:
# Share of missing values per column, expressed as a percentage of all rows.
missing_counts = data.isna().sum()
missing_counts * 100 / len(data)

In [12]:
# M_ZONE_START and M_ZONE_FLAG are removed because of their null values.
null_columns = ['M_ZONE_START', 'M_ZONE_FLAG']
data = data.drop(columns=null_columns)
data

In [13]:
# Discard every row that still contains at least one missing value.
data = data.dropna()

In [14]:
# Count of missing values per column; shown after the dropna step to verify the result.
data.isna().sum()

In [15]:
# Renumber the rows from zero and discard the old index instead of keeping it as a column.
data = data.reset_index(drop=True)

In [16]:
# Class balance of the target: one bar per M_WEATHER value, bar height = number of rows.
# A barplot with y=data.index would instead show the mean row position of each class,
# which says nothing about how often each class occurs.
# seaborn is already imported as `sns` in the first cell, so it is not re-imported here.
sns.countplot(x="M_WEATHER", data=data)

In [17]:
# Draw one histogram per predictor, i.e. every column except the M_WEATHER target.
feature_names = data.columns.drop('M_WEATHER')
for feature in feature_names:
    sns.histplot(data=data, x=feature)
    plt.show()

In [18]:
# Draw one box plot per predictor, i.e. every column except the M_WEATHER target.
predictors = data.columns.drop('M_WEATHER')
for predictor in predictors:
    sns.boxplot(data=data, x=predictor)
    plt.show()

In [19]:
# Pairwise correlation matrix of the cleaned frame; kept in `corr` for the heatmap cell.
# NOTE(review): recent pandas releases raise here if a non-numeric column is still
# present — confirm that every remaining column is numeric.
corr = data.corr()
corr

In [20]:
# Render the correlation matrix on a 30x30 canvas with the colour scale pinned to [-1, 1].
fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(corr, vmin=-1, vmax=1, cmap='RdBu_r', ax=ax)
plt.show()

# Modelling

In [21]:
# Separate the M_WEATHER target from the predictor columns.
y = data['M_WEATHER']
X = data.drop(columns=['M_WEATHER'])

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
split = train_test_split(X, y, test_size=0.20, random_state=33)
X_train, X_test, y_train, y_test = split

In [22]:
# Hyper-parameters for the XGBoost classifier, gathered in one mapping and then unpacked.
# NOTE(review): 'gpu_hist' and 'gpu_id' need a GPU-enabled XGBoost build — confirm they are
# still accepted by the installed XGBoost version.
xgb_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'objective': 'multi:softmax',
    'nthread': 4,
    'num_class': 6,
    'seed': 33,
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
}
xgb = XGBClassifier(**xgb_params)

In [23]:
# Train the XGBoost classifier on the training split.
xgb.fit(X_train, y_train)

In [24]:
# Predict on the held-out split; `pred` is overwritten by the later models' prediction cells.
pred = xgb.predict(X_test)

In [25]:
# classification_report expects (y_true, y_pred). With the arguments reversed, the
# precision and recall columns are exchanged and "support" counts predicted labels
# instead of true labels, so the true labels must come first.
# classification_report is imported with the other sklearn imports in the first cell.
print(classification_report(y_test, pred))

In [26]:
# Bar chart with one bar per training column, height = the fitted XGBoost model's
# feature_importances_ value for that column.
fig, ax = plt.subplots(figsize=(30, 15))
sns.barplot(x=X_train.columns, y=xgb.feature_importances_, ax=ax)
ax.set_title('Feature Importance')
# Rotate the column names so they do not overlap; the trailing ';' hides the return value.
plt.xticks(rotation=90);

# CATBOOST

In [27]:
from catboost import CatBoostClassifier


In [28]:
# CatBoost classifier configured for 1000 iterations on GPU.
# NOTE(review): confirm that the 'devices' string matches the GPUs actually present on the host.
catboost_params = dict(
    iterations=1000,
    task_type="GPU",
    devices='0:3',
)
model = CatBoostClassifier(**catboost_params)


In [29]:
# Train CatBoost on the training split; verbose=False silences the per-iteration log.
model.fit(X_train, y_train, verbose=False)

In [30]:
# Predict on the held-out split with CatBoost; this replaces the previous contents of `pred`.
pred = model.predict(X_test)

In [31]:
# classification_report expects (y_true, y_pred). With the arguments reversed, the
# precision and recall columns are exchanged and "support" counts predicted labels
# instead of true labels, so the true labels must come first.
print(classification_report(y_test, pred))

In [32]:
# Bar chart with one bar per training column, height = CatBoost's feature importance.
# Title and axis labels are set so the figure can be told apart from the XGBoost
# importance chart when the notebook is skimmed.
fig, ax = plt.subplots(figsize=(20, 15))
sns.barplot(x=X_train.columns, y=model.get_feature_importance(), ax=ax)
ax.set_title('CatBoost Feature Importance')
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
# Rotate the column names so they do not overlap; the trailing ';' hides the return value.
plt.xticks(rotation=90);


In [33]:
from sklearn.decomposition import PCA

In [34]:
# PCA centres the data but does not rescale it, so features with large numeric ranges
# would dominate the components. Standardise every feature first.
# The scaler and the PCA are fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into the transform.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# A float n_components keeps the fewest components whose cumulative explained variance
# reaches that fraction (here 80%).
pca = PCA(0.80)
pca_train = pca.fit_transform(X_train_scaled)
pca_test = pca.transform(X_test_scaled)

In [35]:
# Second XGBoost classifier, to be trained on the PCA-transformed features; it uses the
# same hyper-parameter values as the first one.
# NOTE(review): 'gpu_hist' and 'gpu_id' need a GPU-enabled XGBoost build — confirm they are
# still accepted by the installed XGBoost version.
xgbp_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'objective': 'multi:softmax',
    'nthread': 4,
    'num_class': 6,
    'seed': 33,
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
}
xgbp = XGBClassifier(**xgbp_params)

In [38]:
# Train the second XGBoost classifier on the PCA components of the training split.
xgbp.fit(pca_train,y_train)

In [39]:
# Predict on the PCA-transformed held-out split; this replaces the previous contents of `pred`.
pred = xgbp.predict(pca_test)

In [41]:
# classification_report expects (y_true, y_pred). With the arguments reversed, the
# precision and recall columns are exchanged and "support" counts predicted labels
# instead of true labels, so the true labels must come first.
print(classification_report(y_test, pred))