In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import plotly.express as px
import seaborn as sns


In [None]:
# Read the Google Play Store export; `dt` keeps the untouched frame and
# `df` is the working copy that the rest of the notebook modifies.
dt = pd.read_csv(
    '/content/googleplaystore.csv',
    encoding='ISO-8859-1',
)
df = dt.copy(deep=True)

In [None]:
# Preview the first five rows of the working frame.
df.head(n=5)

# Edit data types

In [None]:
# Values that cannot be parsed as numbers become NaN instead of raising.
reviews_as_numbers = pd.to_numeric(df['Reviews'], errors='coerce')
df['Reviews'] = reviews_as_numbers

In [None]:
def convert_size(size):
    """Convert a size label to a number of megabytes.

    Parameters
    ----------
    size : str
        Size label such as ``'19M'`` (megabytes) or ``'201k'`` (kilobytes).

    Returns
    -------
    float
        The size in megabytes. ``np.nan`` is returned when ``size`` is not a
        string, carries no ``'M'``/``'k'`` unit (e.g. "Varies with device"),
        or carries a unit letter but is not a valid number.
    """
    if not isinstance(size, str):
        # Missing values arrive as float NaN; there is nothing to parse.
        return np.nan
    try:
        if 'M' in size:
            return float(size.replace('M', ''))  # already in megabytes
        if 'k' in size:
            return float(size.replace('k', '')) / 1024  # kilobytes -> megabytes
    except ValueError:
        # Text that merely contains the letter 'M' or 'k' is not a size.
        return np.nan
    return np.nan  # "Varies with device" or any other unit-less text
# Only string labels are handed to convert_size; everything else becomes NaN.
df['Size'] = df['Size'].map(
    lambda raw_size: convert_size(raw_size) if isinstance(raw_size, str) else np.nan
)

In [None]:
# Remove the thousands separators and the trailing "+" as literal text.
# regex=False is explicit because "+" is a regex metacharacter and the default
# of `regex` differs between pandas versions. Casting to str first lets the
# cell be re-run after the column has already been converted to integers.
installs_text = (
    df['Installs']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
)
# Anything that still is not a number (including missing values) counts as 0.
df['Installs'] = pd.to_numeric(installs_text, errors='coerce').fillna(0).astype(int)

In [None]:
# Strip the currency sign as literal text. regex=False is explicit because "$"
# is the end-of-string anchor in a regex and the default of `regex` differs
# between pandas versions. Casting to str first lets the cell be re-run after
# the column has already been converted to floats.
price_text = df['Price'].astype(str).str.replace('$', '', regex=False)
# Values that cannot be parsed as a number become NaN.
df['Price'] = pd.to_numeric(price_text, errors='coerce')

In [None]:
# Parse the update dates; entries that cannot be parsed become NaT.
# `infer_datetime_format` is deprecated in pandas 2.x (format inference is the
# default there), so it is no longer passed.
df['Last Updated'] = pd.to_datetime(df['Last Updated'], errors='coerce')

In [None]:
# Print column dtypes and non-null counts to check the conversions above.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics, one row per numeric column.
df.describe().transpose()

# Scaling (log transformation)

In [None]:
from scipy import stats

# Columns that receive the log transform.
numeric_features = ['Rating', 'Reviews', 'Size']

# log1p(x) = log(1 + x), which stays finite for zero values.
# The columns are overwritten, so running this cell again transforms them again.
df[numeric_features] = df[numeric_features].apply(np.log1p)

In [None]:
# One histogram per transformed feature.
for feature_name in numeric_features:
    fig = px.histogram(df, x=feature_name, title=f'Distribution of {feature_name}')
    fig.show()

In [None]:
# One box plot per transformed feature.
for feature_name in numeric_features:
    fig = px.box(df, x=feature_name, title=f'Distribution of {feature_name}')
    fig.show()

In [None]:
# Summary statistics again, now that the log transform has been applied.
df.describe().transpose()

# Fill in missing values

In [None]:
# Replace missing sizes with the mean of the known sizes.
mean_size = df['Size'].mean()
df['Size'] = df['Size'].fillna(mean_size)

In [None]:
# Missing values remaining in each column after filling 'Size'.
df.isna().sum()

# Exploratory Data Analysis

## 1- Top 10 most installed applications

In [None]:
# Rank the apps by install count. At this point the column is still named
# 'Installs'; it is only renamed to 'Installs+' in a later cell, so using the
# later name here raises KeyError when the notebook is run top to bottom.
df_sorted = df.sort_values(by='Installs', ascending=False)
top_10_apps = df_sorted.head(10)  # ten rows, matching the section title

# category_orders keeps the bars in ranking order rather than plotly's default.
px.bar(top_10_apps, x='App', y='Installs', title='Top 10 Installed Applications',
       labels={'App': 'Application', 'Installs': 'Number of Installs'},
       category_orders={'App': top_10_apps['App'].tolist()})

## 2- Number of Updates per App in 2017

In [None]:
# Keep only the rows whose last update happened in 2017.
updated_in_2017 = df['Last Updated'].dt.year == 2017
df_2017 = df[updated_in_2017]

# Count how many of those rows each app name has and keep the first five
# entries of that count table (value_counts sorts by count, descending).
app_row_counts = df_2017['App'].value_counts().reset_index()
app_updates_2017 = app_row_counts.head(5)
app_updates_2017.columns = ['App', 'Update Count']

px.bar(
    app_updates_2017,
    x='App',
    y='Update Count',
    title='Number of Updates per App in 2017',
    labels={'App': 'Application', 'Update Count': 'Number of Updates'},
)

## 3- Most Expensive Application in Each Category

In [None]:
def get_most_expensive_apps(group):
    """Return the one row of ``group`` that has the largest ``'Price'`` value.

    ``group`` is a DataFrame with a ``'Price'`` column; the result is a
    one-row DataFrame with the same columns.
    """
    return group.nlargest(n=1, columns='Price')

# One row per category: the app with the highest price in that category.
most_expensive_apps = (
    df.groupby('Category')
    .apply(get_most_expensive_apps)
    .reset_index(drop=True)
)

# Bar chart of the top price per category, with the app name as bar text.
fig = px.bar(
    most_expensive_apps,
    x='Category',
    y='Price',
    text='App',
    title='Most Expensive Application in Each Category',
    labels={'Category': 'Category', 'Price': 'Price ($)'},
    height=600,
)

# Draw the app names above the bars.
fig.update_traces(texttemplate='%{text}', textposition='outside')

# Order the categories from the highest bar to the lowest and name the axes.
fig.update_layout(
    xaxis={'categoryorder': 'total descending'},
    yaxis_title='Price ($)',
    xaxis_title='Category',
)

fig.show()


## 4- Top 3 Categories with the Most Reviews

In [None]:
# 'Reviews' was overwritten with log1p(Reviews) in the scaling cell, so summing
# the column as-is would add up logarithms rather than review counts. Undo the
# transform with expm1 before aggregating, then round away the float error.
review_counts = np.expm1(df['Reviews'])
category_reviews = (
    review_counts.groupby(df['Category'])
    .sum()
    .round()
    .astype('int64')
    .reset_index()
)

# Sort the categories by the number of reviews in descending order
category_reviews_sorted = category_reviews.sort_values(by='Reviews', ascending=False)

# Select the top 3 categories with the most reviews
top_3_categories = category_reviews_sorted.head(3)

# Bar chart with the review total printed on each bar
fig = px.bar(top_3_categories, x='Category', y='Reviews', title='Top 3 Categories with the Most Reviews',
             labels={'Category': 'Category', 'Reviews': 'Number of Reviews'},
             text='Reviews')

# Draw the totals above the bars
fig.update_traces(texttemplate='%{text}', textposition='outside')

# Name the axes
fig.update_layout(yaxis_title='Number of Reviews',
                  xaxis_title='Category')

fig.show()


# Encode the object-type (text) columns as integers with LabelEncoder

In [None]:
Labol_data = ['Category','Type','Content Rating','Genres','Android Ver']

from sklearn.preprocessing import LabelEncoder , OneHotEncoder

# Fit one encoder per column and keep each of them. Refitting a single shared
# encoder would discard the label -> integer mapping of every column except
# the last, making it impossible to decode the values later.
label_encoders = {}
for column in Labol_data:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le
# As before, `le` ends up holding the encoder fitted on the last column.

# Change the names of some columns

In [None]:
# New column names; every cell below this one uses these names.
column_renames = {
    'Size': 'Size(Megabyte)',
    'Installs': 'Installs+',
}
df.rename(columns=column_renames, inplace=True)

# Split the data into two files: the first contains every row that has a value in the Rating ("classification") column, and the second contains the rows where the Rating column is empty.

In [None]:
# Rows whose 'Rating' is missing. The explicit copy makes this an independent
# frame instead of a slice of `df`.
Rarity_prediction = df[df['Rating'].isna()].copy()

# All remaining rows, i.e. those that do have a 'Rating'.
cleaned_data = df.dropna(subset=['Rating'])

# Save the two frames to separate files.
Rarity_prediction.to_csv('missing_classification_data.csv', index=False)
cleaned_data.to_csv('cleaned_data.csv', index=False)


In [None]:
# Print column dtypes and non-null counts of the rows that have a 'Rating'.
cleaned_data.info()

In [None]:
# Drop every row that still has a missing value in any column.
cleaned_data = cleaned_data.dropna(axis=0, how='any')

In [None]:
# Missing values per column after dropping the incomplete rows.
cleaned_data.isna().sum()

# Temporary data that contains only numeric values.

In [None]:
# Keep only the numeric columns of the cleaned data.
numerical_df = cleaned_data.select_dtypes(include='number')

In [None]:
# Pairwise scatter plots (and per-column distributions) of the numeric columns.
sns.pairplot(data=numerical_df)

In [None]:
# Correlation between every pair of numeric columns, printed to two decimals.
correlation_matrix = numerical_df.corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.2f')

# Variance Inflation Factor

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

# Float/int columns only, without rows that contain missing values.
df_numeric = numerical_df.select_dtypes(include=[float, int]).dropna()

# Standardize the columns and put the result back into a labelled frame
# (fit_transform returns a bare array).
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_numeric),
    columns=df_numeric.columns,
)

In [None]:
# Variance inflation factor of each standardized column, by column position.
vif_scores = [
    variance_inflation_factor(df_scaled.values, column_position)
    for column_position in range(df_scaled.shape[1])
]
vif_data = pd.DataFrame({"Feature": df_scaled.columns, "VIF": vif_scores})

# Reading guide: 0 to 5 is okay, 5 to 10 is worth exploring, above 10 is a problem.
vif_data

In [None]:
# Show the names of the numeric columns.
numerical_df.columns

# Starting to create a Multi regression model

In [None]:
### Splitting Data
# Predictor columns, followed by the target kept as a one-column DataFrame.
feature_columns = [
    'Category', 'Reviews', 'Size(Megabyte)', 'Installs+', 'Type',
    'Price', 'Content Rating', 'Genres', 'Android Ver',
]
X = cleaned_data[feature_columns]
y = cleaned_data[['Rating']]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=796,
)

In [None]:
### Model Training & Building
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# fit() returns the fitted estimator, so it can be bound directly.
reg = LinearRegression().fit(X_train, y_train)

### Model Evaluation & Prediction
y_pred = reg.predict(X_test)

# The printed number is the R² score on the held-out rows.
test_r2 = r2_score(y_test, y_pred)
print('Accuracy: ', test_r2)

In [None]:
# # Variables to track the best accuracy and corresponding random_state
# best_accuracy = -1  # Initialize with a value lower than any possible R²
# best_random_state = None

# # Loop over different values for random_state from 0 to 1000
# for random_state in range(1001):  # Loop from 0 to 1000 inclusive
#     # Splitting Data with varying random_state
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=random_state)

#     # Model Training & Building
#     reg = LinearRegression()
#     reg.fit(X_train, y_train)

#     # Model Evaluation & Prediction
#     y_pred = reg.predict(X_test)
#     accuracy = r2_score(y_test, y_pred)

#     # Update if the current accuracy is better than the best found so far
#     if accuracy > best_accuracy:
#         best_accuracy = accuracy
#         best_random_state = random_state

# # Print the best random_state and the corresponding accuracy
# print(f'Best Random State: {best_random_state} with Accuracy: {best_accuracy}')

In [None]:
# from sklearn.linear_model import Ridge, Lasso
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error

# X = numerical_df.drop(columns=['Rating'])


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=796)

# #Ridge
# ridge_model = Ridge(alpha=1.0)
# ridge_model.fit(X_train, y_train)
# ridge_predictions = ridge_model.predict(X_test)
# ridge_mse = mean_squared_error(y_test, ridge_predictions)
# print("ridge_model.score: ",ridge_model.score(X_test, y_test))
# print("Ridge MSE:", ridge_mse)

# #Lasso
# lasso_model = Lasso(alpha=0.1, max_iter=10000)
# lasso_model.fit(X_train, y_train)
# lasso_predictions = lasso_model.predict(X_test)
# lasso_mse = mean_squared_error(y_test, lasso_predictions)
# print("lasso_model.score: ",lasso_model.score(X_test, y_test))
# print("Lasso MSE:", lasso_mse)

## Predict the missing Rating ("classification") values with the trained model and store them in a separate file

In [None]:
# Same predictor columns as the training features, taken from the rows
# whose 'Rating' is missing.
rarity_feature_columns = [
    'Category', 'Reviews', 'Size(Megabyte)', 'Installs+', 'Type',
    'Price', 'Content Rating', 'Genres', 'Android Ver',
]
X_rarity = Rarity_prediction[rarity_feature_columns]


In [None]:
# Rarity_prediction.loc[:, 'Predicted_Rating'] = y_rarity_pred


In [None]:
# Rarity_prediction['Predicted_Rating'] = y_rarity_pred

In [None]:
# Write Rarity_prediction to 'predicted_missing_ratings.csv' without the index.
# NOTE(review): the cells that would add predictions are commented out above,
# and a later cell writes this same file name again - confirm this export is
# still wanted.
Rarity_prediction.to_csv('predicted_missing_ratings.csv', index=False)

In [None]:
# Rows of `df` whose 'Rating' is missing. The explicit copy makes this an
# independent frame; assigning into a plain slice of `df` is what triggers
# SettingWithCopyWarning, and using .loc on the slice does not prevent it.
Rarity_prediction = df[df['Rating'].isna()].copy()

# Same predictor columns the model was trained on.
X_rarity = Rarity_prediction[['Category', 'Reviews', 'Size(Megabyte)', 'Installs+', 'Type',
                              'Price', 'Content Rating', 'Genres', 'Android Ver']]

# Use the trained model to predict. 'Rating' was log1p-transformed in the
# scaling cell before training, so the predictions are on that log1p scale,
# like the 'Rating' values saved in cleaned_data.csv.
y_rarity_pred = reg.predict(X_rarity)

# The model was fit on a one-column DataFrame target, so predict() returns an
# (n, 1) array; flatten it before storing it in the single 'Rating' column.
Rarity_prediction['Rating'] = np.ravel(y_rarity_pred)

# Save the updated DataFrame
Rarity_prediction.to_csv('predicted_missing_ratings.csv', index=False)


In [None]:
# Preview the first rows of the frame that now holds the predicted ratings.
Rarity_prediction.head()