### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
!pip install pandas
!pip install seaborn



In [14]:
# -*- coding: utf-8 -*-
"""WeatherForecast.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1BddQDCoNG2gDbheBcCiAiBua6eCHouQV

# Only with gooogle colab
from google.colab import drive
drive.mount("/content/drive")

# CLASS PREPROCESSING DATASET
"""

# Standard libraries
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

class PreprocessDataset:
    """
    Class object to clean the weather.csv dataset.

    Parameters
    ____________________________________________________________________________
    filepath_load: str (filepath of the file to be loaded)
    filepath_save: str (filepath of the file to be saved)
    cols_to_drop: list of str (columns to drop from the dataset)
    cols_idx_groupby: list of str (cols used to groupby)
    """

    def __init__(self, filepath_load, filepath_save, cols_to_drop, cols_idx_groupby):
        self.filepath_load = filepath_load
        self.filepath_save = filepath_save
        self.cols_to_drop = cols_to_drop
        self.cols_idx_groupby = cols_idx_groupby
      
    
    def __load_file(self):
        start_time = time.time()
        with open(self.filepath_load, 'rb') as handle:
            self.dataset_raw = pickle.load(handle)
        total_time = - start_time + time.time()
        print(f"File has been loaded in {total_time} seconds.")

    def __load_file_dataframe(self):
        start_time = time.time()
        df = pd.read_csv(filepath_load, low_memory=False)
        total_time = - start_time + time.time()
        print(f"File has been loaded in {total_time} seconds.")
        return df

    def __clean_dataframe(self, df):

        start_time = time.time()

        # Drop col with std = 0 and features not relevant to prediction weather
        cleaned_df = df.copy()
        cleaned_df.drop(columns_to_drop, axis=1, inplace=True) 

        # Groupby session link identifier
        list_session_df = self.__groupby_dataframe(cleaned_df)

        # Drop session with number 13 and 0: time trial, unknown
        for i, dataframe in enumerate(list_session_df):
            session_type = list_session_df[0]["M_SESSION_TYPE"].value_counts().index[0]
        if (session_type == 13) or (session_type == 0):
            del list_session_df[i]

        # Drop rows with NaN
        for i, dataframe in enumerate(list_session_df):
            dataframe.drop_duplicates(keep="first", inplace=True)
            dataframe.dropna(axis=0, inplace=True)

        # Drop rows with all zeros
        for i, dataframe in enumerate(list_session_df):
            list_session_df[i] = dataframe[dataframe['M_WEATHER_FORECAST_SAMPLES_M_SESSION_TYPE'] != 0]

        # Keep only dataframe where M_SESSION_TYPE == M_WEATHER_FORECAST_SAMPLES_M_SESSION_TYPE
        for i, dataframe in enumerate(list_session_df):
            list_session_df[i] = list_session_df[i][list_session_df[i]["M_SESSION_TYPE"] == \
                                           list_session_df[i]["M_WEATHER_FORECAST_SAMPLES_M_SESSION_TYPE"]].copy()
            list_session_df[i] = list_session_df[i][(list_session_df[i]["M_TIME_OFFSET"] != 45)  &
                                           (list_session_df[i]["M_TIME_OFFSET"] != 120) & 
                                           (list_session_df[i]["M_TIME_OFFSET"] != 90)] 
        
        print(f"Preprocessing is over, time taken: {- start_time + time.time()}")
        return list_session_df
 
    def __groupby_dataframe(self, cleaned_df):
        grouped_df = cleaned_df.groupby(self.cols_idx_groupby) # per sessionID, player car index
        list_session_df = []
        for key,item in grouped_df:
            list_session_df.append(grouped_df.get_group(key))
        return list_session_df

    def __save_file(self, list_session_df):
        start_time = time.time()
        with open(self.filepath_save, 'wb') as handle:
            pickle.dump(list_session_df, handle)
        total_time = - start_time + time.time()
        print(f"File has been saved in {total_time} seconds.")

    def run_proprocessing(self):
        dataframe_raw = self.__load_file_dataframe()
        list_session_df = self.__clean_dataframe(dataframe_raw)
        answer_save = int(input("Do you want to save the file? 1: yes, 0: no"))
        if answer_save == 1:
            self.__save_file(list_session_df)
            return list_session_df
        else:
            print("File not saved")
            return list_session_df

    
    def unix_to_timestamp(self, list_session_df):
        for i, dataframe in enumerate(list_session_df):
            try:
                list_session_df[i]["TIMESTAMP"] = \
                list_session_df[i]["TIMESTAMP"].apply(lambda x: pd.to_datetime(x).value / 1e9)
            except:
                print(i)
        return list_session_df


    def timestamp_to_unix(self, list_session_df):
        for i, dataframe in enumerate(list_session_df):
            try:
                list_session_df[i]["TIMESTAMP"] = \
                list_session_df[i]["TIMESTAMP"].apply(lambda x: pd.to_datetime(x, unit='s'))
            except:
                print(i)
        return list_session_df

# Parameters
columns_to_drop = ["M_PLAYER_CAR_INDEX", "M_FRAME_IDENTIFIER", "M_SESSION_TIME", "GAMEHOST", "M_SECONDARY_PLAYER_CAR_INDEX", 
                   "M_PIT_STOP_WINDOW_IDEAL_LAP", "M_GAME_MINOR_VERSION", "M_GEARBOX_ASSIST", 
                   "M_SLI_PRO_NATIVE_SUPPORT", "M_SAFETY_CAR_STATUS",
                   "M_SEASON_LINK_IDENTIFIER", "M_PIT_ASSIST", "M_FORMULA", "M_WEEKEND_LINK_IDENTIFIER", 
                   "M_PIT_STOP_REJOIN_POSITION", "M_PIT_SPEED_LIMIT", "M_DYNAMIC_RACING_LINE",
                   "M_PIT_RELEASE_ASSIST", "M_PIT_STOP_WINDOW_LATEST_LAP", 
                   "M_PACKET_FORMAT", "M_GAME_MAJOR_VERSION", "M_PACKET_VERSION", 
                   "M_PACKET_ID", "M_AI_DIFFICULTY", 
                   "M_IS_SPECTATING", "M_DRSASSIST", "M_NUM_MARSHAL_ZONES", "M_STEERING_ASSIST",
                   "M_NETWORK_GAME","M_ERSASSIST","M_BRAKING_ASSIST","M_SPECTATOR_CAR_INDEX",
                   "M_DYNAMIC_RACING_LINE_TYPE"]


filepath_save = 'dataset_reoganized_v7.pkl'
filepath_load = 'weather_json-2.csv'
cols_idx_groupby = "M_SESSION_LINK_IDENTIFIER"

# Create instance of class PreprocessingDataset
prep = PreprocessDataset(filepath_load, filepath_save, columns_to_drop, cols_idx_groupby)
list_sessions_dataframe = prep.run_proprocessing()

File has been loaded in 8.606765747070312 seconds.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

Preprocessing is over, time taken: 6.435360431671143


Do you want to save the file? 1: yes, 0: no 1


File has been saved in 0.38241004943847656 seconds.
