### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation</font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
import numpy as np 
import pandas as pd

In [2]:
# Load weather.csv from the working directory. low_memory=False makes pandas
# parse the file in a single pass rather than in chunks, which avoids
# mixed-dtype inference on columns.
df = pd.read_csv(
    'weather.csv',
    low_memory=False,
)

In [3]:
# Keep only rows whose forecast-sample count and forecast session type are
# both different from 0. Rows where either value is NaN also pass, because
# NaN != 0 evaluates to True.
nonzero_sample_count = df['M_NUM_WEATHER_FORECAST_SAMPLES'] != 0
nonzero_session_type = df['M_WEATHER_FORECAST_SAMPLES_M_SESSION_TYPE'] != 0
df = df[nonzero_sample_count & nonzero_session_type]

In [4]:
# Columns removed before modelling (packet/version/session identifier fields,
# judging by their names, plus the stray 'Unnamed: 58' column).
# NOTE: the later positional slice df.columns.tolist()[31:39] is taken AFTER
# this drop, so changing this list shifts which columns that slice selects.
columns_to_drop = [
    'M_PACKET_FORMAT',
    'M_GAME_MAJOR_VERSION',
    'M_GAME_MINOR_VERSION',
    'M_PACKET_VERSION',
    'M_PACKET_ID',
    'M_SECONDARY_PLAYER_CAR_INDEX',
    'M_SLI_PRO_NATIVE_SUPPORT',
    'Unnamed: 58',
    'GAMEHOST',
    'M_SESSION_UID',
    'M_SESSION_TIME',
    'M_FRAME_IDENTIFIER',
]
df = df.drop(columns=columns_to_drop)

In [5]:
# Preview TIMESTAMP interpreted as seconds since the Unix epoch. The result is
# only displayed, not assigned, so df['TIMESTAMP'] itself is left unchanged.
pd.to_datetime(df['TIMESTAMP'], unit='s')

17017     2022-01-21 00:16:50
17018     2022-01-21 00:16:50
17019     2022-01-21 00:16:50
17020     2022-01-21 00:16:50
17021     2022-01-21 00:16:50
                  ...        
3572287   2022-01-15 23:14:51
3572288   2022-01-15 23:14:51
3572289   2022-01-15 23:14:51
3572290   2022-01-15 23:14:51
3572291   2022-01-15 23:14:51
Name: TIMESTAMP, Length: 1393074, dtype: datetime64[ns]

In [6]:
# Modelling subset: timestamp, air temperature, and the eight columns at
# positions 31..38 of the current frame (selected by position, not by name).
positional_columns = list(df.columns[31:39])
df_subset = df[['TIMESTAMP', 'M_AIR_TEMPERATURE', *positional_columns]]

# Number of rows that contain at least one missing value.
'rows with na:', len(df_subset) - len(df_subset.dropna())

('rows with na:', 748671)

In [7]:
# Display the selected subset (pandas truncates the rendering to the first and last rows).
df_subset

Unnamed: 0,TIMESTAMP,M_AIR_TEMPERATURE,M_WEATHER_FORECAST_SAMPLES_M_WEATHER,M_WEATHER_FORECAST_SAMPLES_M_TRACK_TEMPERATURE,M_TRACK_TEMPERATURE_CHANGE,M_WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE,M_AIR_TEMPERATURE_CHANGE,M_RAIN_PERCENTAGE,M_WEATHER,M_AI_DIFFICULTY
17017,1.642724e+09,25,,,,,,,0,31
17018,1.642724e+09,25,,,,,,,0,31
17019,1.642724e+09,25,,,,,,,0,31
17020,1.642724e+09,25,,,,,,,0,31
17021,1.642724e+09,25,,,,,,,0,31
...,...,...,...,...,...,...,...,...,...,...
3572287,1.642288e+09,25,0.0,33.0,2.0,25.0,2.0,3.0,0,110
3572288,1.642288e+09,25,0.0,33.0,2.0,25.0,2.0,3.0,0,110
3572289,1.642288e+09,25,0.0,33.0,2.0,25.0,2.0,3.0,0,110
3572290,1.642288e+09,25,0.0,33.0,2.0,25.0,2.0,3.0,0,110


In [8]:
from sklearn import preprocessing

# Everything except the target (M_WEATHER) and TIMESTAMP is rescaled with
# MinMaxScaler (default feature range [0, 1]).
# NOTE(review): the scaler is fit on all rows, before the train/test split
# performed later in the notebook -- confirm this is acceptable.
feature_frame = df_subset.drop(columns=['M_WEATHER', 'TIMESTAMP'])
x = feature_frame.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)

# The scaled array comes back on a fresh 0..n-1 index, so the two unscaled
# columns are re-attached positionally via reset_index(drop=True) before
# TIMESTAMP becomes the index.
# NOTE(review): dayfirst only concerns string parsing; with numeric input and
# unit='s' it should have no effect -- kept unchanged here.
df_norm = (
    pd.DataFrame(x_scaled, columns=feature_frame.columns.tolist())
    .assign(
        TIMESTAMP=pd.to_datetime(df_subset['TIMESTAMP'], unit='s', dayfirst=True).reset_index(drop=True),
        M_WEATHER=df_subset['M_WEATHER'].reset_index(drop=True),
    )
    .set_index('TIMESTAMP')
)
df_norm

Unnamed: 0_level_0,M_AIR_TEMPERATURE,M_WEATHER_FORECAST_SAMPLES_M_WEATHER,M_WEATHER_FORECAST_SAMPLES_M_TRACK_TEMPERATURE,M_TRACK_TEMPERATURE_CHANGE,M_WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE,M_AIR_TEMPERATURE_CHANGE,M_RAIN_PERCENTAGE,M_AI_DIFFICULTY,M_WEATHER
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-01-21 00:16:50,0.666667,,,,,,,0.0,0
2022-01-21 00:16:50,0.666667,,,,,,,0.0,0
2022-01-21 00:16:50,0.666667,,,,,,,0.0,0
2022-01-21 00:16:50,0.666667,,,,,,,0.0,0
2022-01-21 00:16:50,0.666667,,,,,,,0.0,0
...,...,...,...,...,...,...,...,...,...
2022-01-15 23:14:51,0.666667,0.0,0.8125,1.0,0.666667,1.0,0.021739,1.0,0
2022-01-15 23:14:51,0.666667,0.0,0.8125,1.0,0.666667,1.0,0.021739,1.0,0
2022-01-15 23:14:51,0.666667,0.0,0.8125,1.0,0.666667,1.0,0.021739,1.0,0
2022-01-15 23:14:51,0.666667,0.0,0.8125,1.0,0.666667,1.0,0.021739,1.0,0


In [9]:
# Convert each target value to its string form so M_WEATHER holds string
# class labels rather than numbers.
df_norm['M_WEATHER'] = df_norm['M_WEATHER'].apply(str)
df_norm

Unnamed: 0_level_0,M_AIR_TEMPERATURE,M_WEATHER_FORECAST_SAMPLES_M_WEATHER,M_WEATHER_FORECAST_SAMPLES_M_TRACK_TEMPERATURE,M_TRACK_TEMPERATURE_CHANGE,M_WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE,M_AIR_TEMPERATURE_CHANGE,M_RAIN_PERCENTAGE,M_AI_DIFFICULTY,M_WEATHER
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-01-21 00:16:50,0.666667,,,,,,,0.0,0
2022-01-21 00:16:50,0.666667,,,,,,,0.0,0
2022-01-21 00:16:50,0.666667,,,,,,,0.0,0
2022-01-21 00:16:50,0.666667,,,,,,,0.0,0
2022-01-21 00:16:50,0.666667,,,,,,,0.0,0
...,...,...,...,...,...,...,...,...,...
2022-01-15 23:14:51,0.666667,0.0,0.8125,1.0,0.666667,1.0,0.021739,1.0,0
2022-01-15 23:14:51,0.666667,0.0,0.8125,1.0,0.666667,1.0,0.021739,1.0,0
2022-01-15 23:14:51,0.666667,0.0,0.8125,1.0,0.666667,1.0,0.021739,1.0,0
2022-01-15 23:14:51,0.666667,0.0,0.8125,1.0,0.666667,1.0,0.021739,1.0,0


In [10]:
# Discard every row that still has a missing value in any column.
df_norm = df_norm.dropna()

In [11]:
# Separate the target column (Y) from the predictor columns (X).
df = df_norm
Y = df['M_WEATHER']
X = df.drop(columns='M_WEATHER')

In [12]:
from sklearn.model_selection import train_test_split

# First split: hold out 20% of the rows as the final test set.
# The intermediate 80% gets its own names (X_trainval / y_trainval) instead of
# overwriting X. Previously X was replaced by its 80% subset while Y kept
# every row, so running this cell a second time failed with inconsistent
# sample counts. The produced splits are unchanged (same inputs, same seeds).
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, Y, test_size=0.2, train_size=0.8, random_state=42
)

# Second split: 25% of the remaining rows (20% of the total) become the
# validation set; the other 75% (60% of the total) is the training set.
X_train, X_cv, y_train, y_cv = train_test_split(
    X_trainval, y_trainval, test_size=0.25, train_size=0.75, random_state=42
)

In [14]:
from sklearn.ensemble import RandomForestClassifier

# class_weight='balanced' reweights classes inversely to their frequency in
# y_train. random_state pins the forest's internal randomness (bootstrap
# samples and per-split feature subsets) so that Restart & Run All reproduces
# the same fitted model and the same reported accuracies.
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced')

In [15]:
from numpy import mean, std
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

# 10 stratified folds repeated 3 times -> 30 accuracy scores.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# NOTE(review): cross_val_score clones `model` and refits the clone on folds
# of X_cv / y_cv, so this number does not score the estimator that was fitted
# on X_train -- confirm that is the intended evaluation.
n_scores = cross_val_score(
    model,
    X_cv,
    y_cv,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)

# Report mean accuracy with its standard deviation across all folds.
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.982 (0.001)


In [16]:
from sklearn.metrics import accuracy_score

# Accuracy of `model` on the held-out test split.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(accuracy)

0.9818126799140292
