# Brief projet
- Étape 1 : Choisir un secteur qui vous intéresse.
- Étape 2 : Trouver une problématique - identifier une cible (target).
- Étape 3 : Sélectionner une ou plusieurs bases de données (SQL, CSV, API, scraping, etc.).
- Étape 4 : Réaliser l’analyse de données.
- Étape 5 : Réaliser un modèle de machine learning (régression linéaire).
- Rendu : Présentation orale d’un notebook propre, léger et bien structuré (légende et titre sur les graphiques, abscisse et ordonnée ; faire des parties dans le notebook).

- Optionnel : Architecture du projet en POO, RandomizedSearch, GridSearch, Learning curve.

**Outils à utiliser :**
- Analyse : Notebook, Numpy, Pandas, Matplotlib ou Seaborn. Sklearn ou Statsmodels.
- Sklearn : RandomizedSearch, GridSearch, Cross validation, Train/Test Split, modèle de régression linéaire, Pipeline.
- Gestion de projet agile : GitHub, Trello (ou autre outil de gestion de projet : Jira, ClickUp, Teams, etc.).

In [715]:
import myfunctions
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Get Dataframe, features and target

In [None]:
# Load the games dataset with its 'id' column as the index, then drop every row
# whose column values repeat an earlier row (the index is not compared).
df = pd.read_csv(
    'games_data.csv',
    encoding='unicode_escape',
    index_col='id',
)
df = df.drop_duplicates()

# Preview the first five rows.
df.head(n=5)

In [None]:
# Show the dtype pandas assigned to each column of df.
df.dtypes

In [None]:
# Number of non-null values in each column of df.
df.count()

In [None]:
# Keep only the rows whose 'price' value differs from their 'dc_price' value
# (presumably the discounted price; confirm against the dataset description).
price_differs = df['price'].ne(df['dc_price'])
df = df.loc[price_differs]

In [None]:
# Percentage of missing values per column.
# isnull().mean() divides the number of missing cells by the total number of
# rows, so the figure stays correct even when no column is completely filled
# (dividing by the largest non-null count would overstate it in that case).
df.isnull().mean() * 100

In [None]:
# Convert the text columns holding numbers into floats.
kept_columns = ['price', 'dc_price', 'reviews', 'percent_positive']
price_columns = ['price', 'dc_price']

df = df[kept_columns]

# Text clean-up before parsing: remove '%' signs, turn 'Free to play' entries
# into NaN, and replace every ',' with '.'.
df = df.replace('%', '', regex=True)
df = df.replace('Free to play', np.nan, regex=True)
df = df.replace(',', '.', regex=True)

# Anything that still cannot be parsed as a number becomes NaN.
df = df.apply(pd.to_numeric, errors='coerce')

# NOTE(review): both price columns are divided by 10 and rounded to 2 decimals;
# the reason for the factor 10 is not visible in this notebook.
df[price_columns] = df[price_columns].div(10).round(2)

# Discard every row that still contains a missing value.
df = df.dropna()

df

In [None]:
# Percentage of missing values per column.
# isnull().mean() uses the total number of rows as the denominator, whereas
# the largest non-null count is only equal to it when at least one column is
# completely filled.
df.isnull().mean() * 100


In [None]:

# Heatmap of the pairwise correlations between the columns of df.
# annot=True writes each coefficient in its cell and the scale is pinned to
# [-1, 1], the range of a correlation coefficient, so colours are comparable
# between runs. The title is added because the project brief requires one on
# every figure.
ax = sns.heatmap(df.corr(), annot=True, vmin=-1, vmax=1)
ax.set_title('Correlation matrix of the numeric columns')

---

In [None]:
# Display the current contents of df.
df

In [None]:
# Iteration 1: negative r2 (earlier attempt kept for reference: X = df[['price']]).

# Target: the 'dc_price' column. Features: every other column of df.
y = df['dc_price']
X = df.drop('dc_price', axis=1)

# Names of the numeric feature columns.
numeric_features = X.select_dtypes(include=['float', 'int']).columns

# Names of the categorical feature columns.
categorical_features = X.select_dtypes(include=['object', 'category']).columns

In [713]:
# Check the dtype of each feature column in X.
X.dtypes

price               float64
reviews             float64
percent_positive    float64
dtype: object

### Define the list of regression models, scoring, strategies

In [None]:
# Candidate regressors, each created with its default parameters.
models = [
    LinearRegression(),
    Ridge(),
    Lasso(),
]

# Scoring metric names used to evaluate the candidates.
scorings = [
    'r2',
]

# SimpleImputer strategies to compare.
strategies = [
    'mean',
    'median',
    'most_frequent',
    'constant',
]

### Create a preprocessor for each strategy

In [None]:
# Build one preprocessing ColumnTransformer per imputation strategy, keyed by
# the strategy name.
preprocessings = {}
for strategie in strategies:
    # Imputation and scaling must be chained: listing them as two separate
    # ColumnTransformer entries on the same columns runs them in parallel and
    # concatenates both outputs, so every numeric column would appear twice
    # (once imputed but unscaled, once scaled but not imputed).
    numeric_pipeline = Pipeline(
        [
            ('imputer', SimpleImputer(strategy=strategie)),
            ('scaler', StandardScaler()),
        ]
    )
    preprocessing = ColumnTransformer(
        [
            ('numeric', numeric_pipeline, numeric_features),
            # handle_unknown='ignore' keeps cross-validation from failing when
            # a validation fold holds a category unseen in its training fold.
            ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ]
    )
    preprocessings[strategie] = preprocessing
    print(f'Strategie: {strategie}')


### Loop and score for each strategy, scoring method and model

In [None]:

# Score every (model, scoring, preprocessing) combination.
# The helper is never defined in this notebook and only the module itself is
# imported (`import myfunctions`), so the bare name raised a NameError; it is
# called through the module instead.
# NOTE(review): assumes get_pipelines_dataframe lives in myfunctions and that
# [5, 10] are the cross-validation fold counts to try; confirm both.
df_tests = myfunctions.get_pipelines_dataframe(
    X, y, models, scorings, preprocessings, [5, 10]
)
df_tests

### Sorting results by mean of scores

In [None]:
# The 20 rows of df_tests with the highest 'mean' value, best first.
df_tests.nlargest(n=20, columns='mean')

In [None]:
# Pick the 'pipeline' entry of the row with the highest 'mean' value.
top_row = df_tests.nlargest(n=1, columns='mean')
best_pipeline = top_row['pipeline'].iloc[0]
best_pipeline

# Pickle the best Pipeline

In [None]:
import pickle

# Persist the selected pipeline to disk. The context manager guarantees the
# file is flushed and closed even if pickling fails, which the bare open()
# call did not.
with open('pipeline.pkl', 'wb') as pipeline_file:
    pickle.dump(best_pipeline, pipeline_file)

In [None]:
# Reload the pipeline saved to 'pipeline.pkl'; the context manager closes the
# file handle once loading is done.
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
with open('pipeline.pkl', 'rb') as pipeline_file:
    pkl = pickle.load(pipeline_file)

# predict() needs a 2D table with the same feature columns as X; the bare
# string passed before is rejected. The first rows of X serve as a smoke test.
# NOTE(review): assumes best_pipeline was fitted before being pickled; if it
# was not, call pkl.fit(X, y) first. TODO confirm.
pkl.predict(X.head())