## Recommendation Systems: Build recommendation systems for users based on app features and ratings. ( org)

### Step - 1. Setup and Data Preparation

In [37]:
# import the libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.sparse import hstack, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import warnings
warnings.filterwarnings("ignore")

In [38]:
# 1. Data Loading and Initial Exploration

dataset = pd.read_csv('googleplaystore.csv')
print("Original Dataset Shape:", dataset.shape)
print(dataset.head())


Original Dataset Shape: (10841, 13)
                                                 App        Category  Rating  \
0     Photo Editor & Candy Camera & Grid & ScrapBook  ART_AND_DESIGN     4.1   
1                                Coloring book moana  ART_AND_DESIGN     3.9   
2  U Launcher Lite – FREE Live Cool Themes, Hide ...  ART_AND_DESIGN     4.7   
3                              Sketch - Draw & Paint  ART_AND_DESIGN     4.5   
4              Pixel Draw - Number Art Coloring Book  ART_AND_DESIGN     4.3   

  Reviews  Size     Installs  Type Price Content Rating  \
0     159   19M      10,000+  Free     0       Everyone   
1     967   14M     500,000+  Free     0       Everyone   
2   87510  8.7M   5,000,000+  Free     0       Everyone   
3  215644   25M  50,000,000+  Free     0           Teen   
4     967  2.8M     100,000+  Free     0       Everyone   

                      Genres Last Updated         Current Ver   Android Ver  
0               Art & Design     7-Jan-18         

### Step - 2. Data Cleaning and Preprocessing

In [39]:
backup_dataset= dataset.copy()  # create a backup copy of dataset

In [40]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [41]:

dataset['Category']


0             ART_AND_DESIGN
1             ART_AND_DESIGN
2             ART_AND_DESIGN
3             ART_AND_DESIGN
4             ART_AND_DESIGN
                ...         
10836                 FAMILY
10837                 FAMILY
10838                MEDICAL
10839    BOOKS_AND_REFERENCE
10840              LIFESTYLE
Name: Category, Length: 10841, dtype: object

In [42]:
# Convert 'Installs' to string first to handle potential non-string entries

dataset['Installs'] = dataset['Installs'].astype(str).str.replace(r'[+,]', '', regex=True)

In [43]:
dataset = dataset[dataset['Installs'].str.isnumeric()]  # Keep only numeric values
dataset['Installs'] = dataset['Installs'].astype(int)

In [44]:
# Clean 'Price' column
dataset['Price'] = dataset['Price'].str.replace('$', '', regex=False).astype(float)

In [45]:
# Convert 'Last Updated' to datetime
dataset['Last Updated'] = pd.to_datetime(dataset['Last Updated'])

In [46]:
# Convert 'Reviews' to numeric
dataset['Reviews'] = pd.to_numeric(dataset['Reviews'], errors='coerce')

In [47]:
# Handle 'Size'

def convert_size_to_mb(size):
    if isinstance(size, str):
        if 'M' in size:
            return float(size.replace('M', ''))
        elif 'k' in size:
            return float(size.replace('k', '')) / 1024
        elif 'Varies with device' in size:
            return np.nan  # Or a suitable placeholder
    return np.nan

In [48]:

dataset['Size'] = dataset['Size'].apply(convert_size_to_mb)
dataset['Size'] = pd.to_numeric(dataset['Size'], errors='coerce') # Handle any conversion errors

In [49]:
# Impute missing 'Size' values using median for each category

dataset['Size'] = dataset.groupby('Category')['Size'].transform(lambda x: x.fillna(x.median()))

In [50]:
# Outlier Handling (Reviews) - BEFORE SPLITTING

Q1 = dataset['Reviews'].quantile(0.25)
Q3 = dataset['Reviews'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
dataset['Reviews'] = np.clip(dataset['Reviews'], lower_bound, upper_bound)

### Step- 3. Feature Engineering

In [51]:
# App Age (in days)
dataset['App Age'] = (pd.to_datetime('2018-12-31') - dataset['Last Updated']).dt.days  # Assuming analysis date is end of 2018
dataset['App Age'] = dataset['App Age'].fillna(dataset['App Age'].median()) #handle any missing values after calculation

dataset['Features'] = dataset['Category'] + ' ' + dataset['Genres'] + ' ' + dataset['App']  #Includes App Name


#### Step - 4 Split Data (Crucially, *before* imputation and encoding)

In [52]:
# Remove rows with NaN in 'Rating' BEFORE the split
dataset = dataset.dropna(subset=['Rating'])

In [53]:
# Split Data (Crucially, *before* imputation and encoding)

X = dataset.drop('Rating', axis=1)
y = dataset['Rating']


In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (7492, 14)
Testing set shape: (1874, 14)


### step - 5. Define Numerical and Categorical Features (AFTER SPLIT)

In [55]:


numerical_features = ['Reviews', 'Size', 'Installs', 'Price', 'App Age']
categorical_features = ['Type', 'Content Rating']

#### Step - 6. Imputation (AFTER SPLIT)

In [56]:
print("Starting Imputation...")

imputer_numerical = SimpleImputer(strategy='median')
# Use median for robustness
print("SimpleImputer created.")

# Impute missing values separately for each column
print("Starting imputation loop...")

Starting Imputation...
SimpleImputer created.
Starting imputation loop...


In [57]:
# Impute missing values separately for each column

for col in numerical_features:
    print(f"Imputing column: {col}")
    X_train[col] = X_train[[col]].fillna(X_train[[col]].median())
    X_test[col] = X_test[[col]].fillna(X_test[[col]].median())
    print(f"Column {col} imputed.")
print("Imputation complete.")

Imputing column: Reviews
Column Reviews imputed.
Imputing column: Size
Column Size imputed.
Imputing column: Installs
Column Installs imputed.
Imputing column: Price
Column Price imputed.
Imputing column: App Age
Column App Age imputed.
Imputation complete.


#### step -  7. Scaling (AFTER IMPUTATION)

In [58]:


scaler = MinMaxScaler()
numerical_scaled_train = scaler.fit_transform(X_train[numerical_features])
numerical_scaled_test = scaler.transform(X_test[numerical_features])

#### Step - 8. Encoding (AFTER SPLIT)

In [59]:
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_data_train = encoder.fit_transform(X_train[categorical_features])
encoded_data_test = encoder.transform(X_test[categorical_features])

In [60]:
# Export the file for further use 

# dataset.to_csv('sorted_dataset.csv', index=False)

In [61]:
dataset.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,App Age,Features
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159.0,19.0,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,358,ART_AND_DESIGN Art & Design Photo Editor & Can...
1,Coloring book moana,ART_AND_DESIGN,3.9,967.0,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,350,ART_AND_DESIGN Art & Design;Pretend Play Color...
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510.0,8.7,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,152,ART_AND_DESIGN Art & Design U Launcher Lite – ...
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,136881.75,25.0,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,206,ART_AND_DESIGN Art & Design Sketch - Draw & Paint
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967.0,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,194,ART_AND_DESIGN Art & Design;Creativity Pixel D...


### step - 9. TF-IDF Vectorization:

In [62]:
# --- TF-IDF Vectorization ---

tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), max_features=1000) # reduced ngram_range, added max_features
tfidf_matrix_train = tfidf_vectorizer.fit_transform(X_train['Features']).toarray()
tfidf_matrix_test = tfidf_vectorizer.transform(X_test['Features']).toarray()

##### Explanation:

  ##### TfidfVectorizer: This converts the text in the "Features" column into a numerical representation that machine learning models can understand. TF-IDF (Term Frequency-Inverse Document Frequency) weighs words based on their importance within each app's description and across the entire dataset.

##### stop_words='english': This removes common English words (like "the", "a", "is") that don't contribute much to the meaning and can clutter the analysis.

##### tfidf_matrix.shape: The output will show you the dimensions of the matrix. 

### Step -10. Sparse Matrix Conversion

In [64]:
import gc  # Import garbage collection

numerical_scaled_train_sparse = csr_matrix(numerical_scaled_train, dtype=np.float32)  # Added dtype
gc.collect()
numerical_scaled_test_sparse = csr_matrix(numerical_scaled_test, dtype=np.float32) # Added dtype
gc.collect()
encoded_data_train_sparse = csr_matrix(encoded_data_train, dtype=np.float32) # Added dtype
gc.collect()
encoded_data_test_sparse = csr_matrix(encoded_data_test, dtype=np.float32) # Added dtype
gc.collect()

0

#### Step - 11 Feature Combination (Sparse)

In [65]:
combined_features_train = hstack([tfidf_matrix_train, numerical_scaled_train_sparse, encoded_data_train_sparse])
combined_features_test = hstack([tfidf_matrix_test, numerical_scaled_test_sparse, encoded_data_test_sparse])

print("Combined features (train) shape:", combined_features_train.shape)
print("Combined features (test) shape:", combined_features_test.shape)

Combined features (train) shape: (7492, 1013)
Combined features (test) shape: (1874, 1013)


###  Step - 12 - Prepare model - HYPERPARAMETER TUNING

In [66]:

param_grid = {
    'n_estimators': [100, 200], # Reduce n_estimators
    'max_depth': [None, 10], #Reduce max_depth
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}


In [67]:
# Step -13 - train the model (USING BEST MODEL)

grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_grid=param_grid,
                           cv=3,  # 3-fold cross-validation
                           scoring='neg_mean_squared_error',  # or 'r2'
                           verbose=2,
                           n_jobs=-1)  

In [68]:
# Use all available cores

grid_search.fit(combined_features_train, y_train)

best_model = grid_search.best_estimator_  # Use the best model

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [69]:
# Step -13 - train the model (USING BEST MODEL)
# best_model.fit(combined_features_train, y_train) 
# No need to fit again, GridSearchCV already did

### Step - 14 # --- Make Predictions ---

In [70]:
y_pred = best_model.predict(combined_features_test)

### Step - 15. Evaluation

In [81]:
# --- Evaluate the Model ---

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")
print(f"R-squared: {r2:.4f}")

Mean Squared Error: 0.1979
Mean Absolute Error: 0.2947
R-squared: 0.1971


#### Step -16. Recommendation Function

In [82]:
def recommend_apps(app_name, num_recommendations=5):
    """
    Recommends similar apps based on feature similarity using the trained model.
    """
    try:
        # Get the features of the target app
        #app_index = X[X['App'] == app_name].index[0]

        # Find the app in the *original* dataframe
        app_data = dataset[dataset['App'] == app_name].iloc[0].to_frame().T # Get the row of the given app, from the parent dataset
        app_data = app_data.drop('Rating', axis=1) # Remove Rating from this entry for processing.

        # Impute and encode features
        for col in numerical_features: #Impute numerical features
            app_data[col] = app_data[col].fillna(X_train[col].median())
        numerical_scaled_app = scaler.transform(app_data[numerical_features]) #Scale numerical features
        encoded_data_app = encoder.transform(app_data[categorical_features]) #Encode categorical features
        tfidf_matrix_app = tfidf_vectorizer.transform(app_data['Features']) #Create the TFIDF matrix

        #Ensure matrixes are sparse
        numerical_scaled_app_sparse = csr_matrix(numerical_scaled_app)
        encoded_data_app_sparse = csr_matrix(encoded_data_app)
        combined_features_app = hstack([tfidf_matrix_app, numerical_scaled_app_sparse, encoded_data_app_sparse])

        # Make Prediction
        predicted_rating = best_model.predict(combined_features_app)[0]
        print(f"Predicted rating for the app: {predicted_rating:.2f}")

        #Compute Similarity
        similarity_scores = cosine_similarity(combined_features_app, combined_features_train)
        similar_app_indices = similarity_scores.argsort()[0][-(num_recommendations + 1):-1][::-1]

        # Print Results
        recommended_apps = X_train.iloc[similar_app_indices]['App'].values
        print(f"Recommended apps similar to {app_name}:")
        for app in recommended_apps:
            print(app)

    except IndexError:
        print(f"App '{app_name}' not found in the dataset.")
    except Exception as e:
        print(f"An error occurred: {e}")


#### step 17 - Example usage:


In [86]:
recommend_apps('ibis Paint X', num_recommendations=5)

Predicted rating for the app: 4.46
Recommended apps similar to ibis Paint X:
FlipaClip - Cartoon animation
Textgram - write on photos
Textgram - write on photos
Infinite Painter
SketchBook - draw and paint


### step 18 - Save model AND preprocessing objects into file


In [88]:
# Step 18 - Save model AND preprocessing objects into file (Updates)
model_data = {
    'model': best_model,
    'scaler': scaler,
    'encoder': encoder,
    'tfidf_vectorizer': tfidf_vectorizer,
    'numerical_features': numerical_features,  # List of numerical features
    'categorical_features': categorical_features,  # List of categorical features
    'apps_list': X_train['App'].tolist()  # List of app names
}

with open('app_recommender_model.pkl', 'wb') as file:
    pickle.dump(model_data, file)

print("Model and components saved successfully")


Model and components saved successfully
