In [None]:
import pandas as pd
import seaborn as sns

df = pd.read_csv("../Donnees/cleaned_data.csv")


# Visualisation

In [None]:
number_columns = ["rating", "retirement", "time", "price", "review_count", "rating_value", "twenty_four_hours", "all_time"]
dfn = df[number_columns]

In [None]:
from sklearn.ensemble import IsolationForest

forest = IsolationForest()
outliers = forest.fit_predict(dfn)



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


dfn = dfn.drop(index=np.where(outliers==-1)[0])
scaler = StandardScaler()  # Standardize numerical variables
dfn = pd.DataFrame(scaler.fit_transform(dfn), columns=number_columns)


In [None]:
sns.pairplot(dfn)
plt.savefig("pairplot.png")

In [None]:
sns.heatmap(dfn.corr(),cmap="coolwarm")

In [None]:
dfn["price"].plot.hist()
plt.xlabel("Price")

In [None]:
df['rating'].plot.hist()
plt.xlabel("Rating")

In [None]:
dfn["price"].plot.box()

In [None]:
df.describe()

# Traitement des colonnes

In [None]:
import ast

def dummies_list(df):
    df2 = pd.get_dummies(pd.DataFrame(df.values.tolist()), prefix_sep='', prefix='')
    merged_columns = {}
    for col_name, col_data in df2.items():
        if col_name.strip() not in merged_columns:
            merged_columns[col_name.strip()] = col_data
        else:
            merged_columns[col_name.strip()] += col_data
    merged_platform = pd.DataFrame(merged_columns)
    return merged_platform

def X_with_dummies_genres(X):
    X = X.dropna(axis="rows")
    X['genres'] = X['genres'].astype(str).apply(ast.literal_eval)

    X_genre = dummies_list(X.genres)

    #X = X.drop(columns=["genres"])

    #X = pd.merge(X,X_genre,how = "left",left_index=True,right_index=True)

    return X_genre


In [51]:
nb_tags = 150


X_genre = X_with_dummies_genres(df)
X_genre = X_genre.drop(index=np.where(outliers==-1)[0])
colsum = list(zip(X_genre.columns, X_genre.sum()))
to_drop = sorted(colsum, key=lambda x:x[1], reverse=True)[nb_tags:]
to_drop = [e[0] for e in to_drop]
X_genre = X_genre.drop(to_drop, axis=1)
X_genre.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['genres'] = X['genres'].astype(str).apply(ast.literal_eval)


(2335, 150)

In [None]:
X_genre = X_with_dummies_genres(df)
X_genre = X_genre.drop(index=np.where(outliers==-1)[0])


# reduction de dimension

from sklearn.decomposition import PCA
pca = PCA()
pca.fit(X_genre)

summary = pd.DataFrame({"variances":pca.explained_variance_,"ratio":pca.explained_variance_ratio_,"cumulative ratio":np.cumsum(pca.explained_variance_ratio_)})
(100*summary['cumulative ratio']).plot.bar()

# Analyses

### PCA

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(dfn)
data = scaler.transform(dfn) # normalisation
dataframe = pd.DataFrame(data)
pca = PCA()
pca.fit(dataframe)

summary = pd.DataFrame({"variances":pca.explained_variance_,"ratio":pca.explained_variance_ratio_,"cumulative ratio":np.cumsum(pca.explained_variance_ratio_)})
(100*summary['cumulative ratio']).plot.bar()

In [None]:
pca_data = pca.transform(dataframe)
pca_data = pd.DataFrame(pca_data).drop([5,6,7],axis='columns') # on ne garde que les 3 premières dimensions (80% de l'information)

# Cercle de correlation 
n = data.shape[0] # nb of individuals
p = data.shape[1] # nb of variables
print(n, '  ', p)
eigval = (n-1) / n * pca.explained_variance_ # eigen values
sqrt_eigval = np.sqrt(eigval)
corvar = np.zeros((p,p)) # empty matrix for coordinates
for k in range(p):
    corvar[:,k] = pca.components_[k,:] * sqrt_eigval[k]
# on modifie pour avoir un dataframe
coordvar = pd.DataFrame({'id': dfn.columns, 'COR_1': corvar[:,0], 'COR_2': corvar[:,1]})

fig, axes = plt.subplots(figsize = (6,6))
fig.suptitle("Cercle des corrélations")
axes.set_xlim(-1, 1)
axes.set_ylim(-1, 1)
# Ajout des axes
axes.axvline(x = 0, color = 'lightgray', linestyle = '--', linewidth = 1)
axes.axhline(y = 0, color = 'lightgray', linestyle = '--', linewidth = 1)
# Ajout des noms des variables
for j in range(p):
    axes.text(coordvar["COR_1"][j],coordvar["COR_2"][j], coordvar["id"][j])
# Ajout du cercle
plt.gca().add_artist(plt.Circle((0,0),1,color='blue',fill=False))

plt.show()

In [None]:
plt.scatter(pca_data[0],pca_data[1],s=5)
for i in range(pca.components_.shape[1]):
    plt.arrow(0,0,pca.components_[0,i]*10,pca.components_[1,i]*10,alpha=0.5)
    plt.text(pca.components_[0,i]*10,pca.components_[1,i]*10,dfn.columns[i])

In [None]:
from sklearn.cluster import KMeans
tab20 = plt.get_cmap('tab20')

km = KMeans(n_clusters=6)
pred = km.fit_predict(data)
plt.scatter(pca_data[0],pca_data[1],color = [tab20.colors[pred[i]] for i in range(len(pca_data))],s=5)


real_centers = np.exp(pca.inverse_transform(km.cluster_centers_))
fig, axs = plt.subplots(km.n_clusters//2, 2, sharey=True,sharex=True)
for i,k in enumerate(real_centers):
    axs.flatten()[i].bar(range(len(k)),k,color=tab20.colors[i])
    axs.flatten()[i].set_xticks(range(len(k)))
    axs.flatten()[i].set_xticklabels(dfn.columns,rotation="vertical")

### Prédiction du rating en fonction du genre, de la platforme, de la date de sortie et du prix.

In [None]:

def X_with_dummies_genre_platform(X):
    X = X.dropna(axis="rows")
    X['platform'] = X['platform'].astype(str).apply(ast.literal_eval) # certaines listes sont enregistrées en str donc on remet tout en listes
    X['genre'] = X['genre'].astype(str).apply(ast.literal_eval)

    X_platform = dummies_list(X.platform)
    X_genre = dummies_list(X.genre)

    X = X.drop(columns=["platform","genre"])

    X = pd.merge(X,X_platform,how = "left",left_index=True,right_index=True)
    X = pd.merge(X,X_genre,how = "left",left_index=True,right_index=True)

    return X

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score


# Preprocessing

X = df[['platform', 'genre', 'date', 'price', "time"]]
y = df['rating']
X = X_with_dummies_genre_platform(X)
X = X.drop(index=np.where(outliers==-1)[0])
y = y.drop(index=np.where(outliers==-1)[0])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train = list(y_train)
y_test = list(y_test)


model = LinearRegression()

#model = Ridge()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')


### Prédiction du retirement en fonction du rating, de la plateforme, du genre, de la date et du prix

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Preprocessing
X = df[['rating',"pourcentage_pos", 'platform', 'genre', 'date', 'price','time',"twenty_four_hours","all_time"]]
y = df['retirement']

X = X_with_dummies_genre_platform(X)

X = X.drop(index=np.where(outliers==-1)[0])
y = y.drop(index=np.where(outliers==-1)[0])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict retirement on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')



## Clusters

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

X = df[['platform', 'genre', 'rating']]
X = X_with_dummies_genre_platform(X)
#scaler = StandardScaler()  # Standardize numerical variables
#X_scaled = scaler.fit_transform(X_encoded)



In [None]:
# t-SNE
from sklearn.manifold import TSNE

tab20 = plt.get_cmap('tab20')

km = KMeans(n_clusters=5)
km.n_clusters = 8
pred = km.fit_predict(X)

tsne = TSNE(n_components=2)
tsne_data = tsne.fit_transform(X)
plt.figure(figsize=(8, 6))
plt.scatter(tsne_data[:, 0], tsne_data[:, 1], c=pred, cmap='tab20', s=10)
plt.title('t-SNE Visualization')
plt.show()