In [None]:
import pandas as pd
import numpy as np

# Set seed for reproducibility
np.random.seed(42)

# Define the sub-categories for groceries
sub_categories = ['Fruits', 'Vegetables', 'Dairy', 'Snacks', 'Beverages', 'Grains', 'Meat', 'Spices', 'Packaged Foods', 'Frozen Foods']

# Define possible rating scores and discount status
rating_scores = [3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5]
has_discount = [0, 1]  # 0: No Discount, 1: Discount

# Define the ranges for Price in INR for different categories
price_ranges = {
    'Fruits': (20, 300),
    'Vegetables': (10, 150),
    'Dairy': (40, 200),
    'Snacks': (20, 100),
    'Beverages': (30, 500),
    'Grains': (50, 1000),
    'Meat': (100, 2000),
    'Spices': (10, 200),
    'Packaged Foods': (50, 800),
    'Frozen Foods': (50, 400)
}

# Define the ranges for Num_reviews
num_reviews_range = (50, 1000)

# Function to generate synthetic data
def generate_data(num_rows):
    data = []
    for _ in range(num_rows):
        sub_category = np.random.choice(sub_categories)
        price = np.random.randint(price_ranges[sub_category][0], price_ranges[sub_category][1])
        has_discount_value = np.random.choice(has_discount)
        rating = np.random.choice(rating_scores)
        num_reviews = np.random.randint(num_reviews_range[0], num_reviews_range[1])

        # Determine the popularity based on rating, number of reviews, and discount
        if rating >= 4.5 and num_reviews > 300:
            popularity = 'High'
        elif rating >= 4.0 and num_reviews >= 100:
            popularity = 'Medium'
        else:
            popularity = 'Low'

        data.append([sub_category, price, has_discount_value, rating, num_reviews, popularity])

    # Create a DataFrame from the generated data
    df = pd.DataFrame(data, columns=['Sub Category', 'Price (INR)', 'Has_Discount', 'Rating_score', 'Num_reviews', 'Popularity'])
    return df

# Generate a dataset of 1500 rows
dataset = generate_data(1500)

# Save to a CSV file (optional)
dataset.to_csv('popularity_clean.csv', index=False)

# Display the first few rows of the dataset
print(dataset.head())


  Sub Category  Price (INR)  Has_Discount  Rating_score  Num_reviews  \
0         Meat         1559             0           4.5          121   
1    Beverages          132             1           3.7          264   
2       Spices          126             1           4.2          713   
3        Dairy          189             0           3.6          393   
4       Grains          435             1           3.9          210   

  Popularity  
0     Medium  
1        Low  
2     Medium  
3        Low  
4        Low  


In [None]:
import pandas as pd
import numpy as np

# Define parameters
num_rows = 2000  # Set the number of rows to 2000
categories = ['Fruits', 'Dairy', 'Snacks', 'Beverages', 'Vegetables']
price_range = [20, 1000]  # Price range in INR

# Create random data
user_ids = np.random.randint(1, 101, num_rows)  # Random User IDs between 1 and 100
item_ids = np.random.randint(100, 500, num_rows)  # Random Item IDs between 100 and 500
categories_col = np.random.choice(categories, num_rows)  # Random categories
prices = np.random.randint(price_range[0], price_range[1], num_rows)  # Random prices
cf_scores = np.random.uniform(1, 5, num_rows)  # Random CF Scores between 1 and 5
cbf_scores = np.random.uniform(0, 1, num_rows)  # Random CBF Scores between 0 and 1

# Calculate Final Score (Weighted Average)
final_scores = 0.7 * cf_scores + 0.3 * cbf_scores  # Weighted sum of CF and CBF

# Create DataFrame
data = {
    'User ID': user_ids,
    'Item ID': item_ids,
    'Category': categories_col,
    'Price (INR)': prices,
    'CF Score': cf_scores,
    'CBF Score': cbf_scores,
    'Final Score': final_scores
}

df = pd.DataFrame(data)

# Save to CSV (optional)
df.to_csv('recommendation_dataset.csv', index=False)

# Show the first few rows
print(df.head())


   User ID  Item ID    Category  Price (INR)  CF Score  CBF Score  Final Score
0       85      111  Vegetables          515  3.570984   0.231845     2.569242
1       21      336   Beverages          516  4.570970   0.108186     3.232135
2       18      421      Snacks          524  4.539496   0.640917     3.369923
3       55      357  Vegetables          551  3.798178   0.523183     2.815679
4       29      498   Beverages          988  2.863081   0.899145     2.273900


In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Set the random seed for reproducibility
np.random.seed(42)

# Define parameters
num_rows = 2000  # 2,000 rows for the dataset
categories = ['Fruits', 'Dairy', 'Beverages', 'Snacks', 'Vegetables']
days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
seasonality = ['Holiday', 'Regular', 'Peak Season', 'Off-Season']

# Generate random dates over the past year
start_date = datetime(2024, 1, 1)
dates = [start_date + timedelta(days=random.randint(0, 365)) for _ in range(num_rows)]

# Create random product data
product_ids = np.random.randint(100, 200, num_rows)
prices = np.random.randint(20, 1000, num_rows)
quantities_sold = np.random.randint(1, 50, num_rows)
discounts = np.random.randint(0, 30, num_rows)  # Discount between 0% and 30%
promotions = np.random.choice(['Yes', 'No'], num_rows)  # Random promotion
store_location = ['Bangalore'] * num_rows  # All store locations set to Bangalore
day_of_week = np.random.choice(days_of_week, num_rows)
season = np.random.choice(seasonality, num_rows)

# Calculate Sales (Target Variable: Price * Quantity Sold)
sales = prices * quantities_sold * (1 - discounts / 100)

# Create DataFrame
data = {
    'Date': dates,
    'Product ID': product_ids,
    'Product Category': np.random.choice(categories, num_rows),
    'Price (INR)': prices,
    'Quantity Sold': quantities_sold,
    'Discount (%)': discounts,
    'Promotions': promotions,
    'Store Location': store_location,
    'Day of the Week': day_of_week,
    'Seasonality': season,
    'Sales (Target)': sales
}

df = pd.DataFrame(data)

# Save to CSV
df.to_csv('sales_prediction_dataset.csv', index=False)

# Show the first few rows
print(df.head())


        Date  Product ID Product Category  Price (INR)  Quantity Sold  \
0 2024-02-07         151        Beverages          607             45   
1 2024-12-18         192            Dairy          908             26   
2 2024-12-02         114       Vegetables           65             39   
3 2024-09-07         171           Snacks          691             28   
4 2024-06-19         160           Fruits          482              5   

   Discount (%) Promotions Store Location Day of the Week  Seasonality  \
0            16        Yes      Bangalore          Sunday  Peak Season   
1            27         No      Bangalore         Tuesday      Regular   
2            24         No      Bangalore          Monday  Peak Season   
3            28         No      Bangalore          Sunday   Off-Season   
4            15         No      Bangalore          Sunday      Holiday   

   Sales (Target)  
0        22944.60  
1        17233.84  
2         1926.60  
3        13930.56  
4         2048.5

In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Set the random seed for reproducibility
np.random.seed(42)

# Define parameters
num_rows = 2000  # 2,000 rows for the dataset
categories = ['Fruits', 'Dairy', 'Beverages', 'Snacks', 'Vegetables']
customer_segments = ['Budget Shopper', 'Premium Shopper', 'Regular Shopper']

# Generate random dates over the past year
start_date = datetime(2024, 1, 1)
dates = [start_date + timedelta(days=random.randint(0, 365)) for _ in range(num_rows)]

# Create random product data
customer_ids = np.random.randint(1, 501, num_rows)  # Random Customer IDs between 1 and 500
product_ids = np.random.randint(100, 200, num_rows)  # Random Product IDs between 100 and 200
prices = np.random.randint(20, 1000, num_rows)  # Random prices between 20 INR and 1000 INR
quantities_sold = np.random.randint(1, 50, num_rows)  # Random quantity sold between 1 and 50
customer_segment = np.random.choice(customer_segments, num_rows)  # Random customer segments

# Calculate Sales (Target Variable: Price * Quantity Sold)
sales = prices * quantities_sold

# Create DataFrame
data = {
    'Customer ID': customer_ids,
    'Customer Segment': customer_segment,
    'Product ID': product_ids,
    'Product Category': np.random.choice(categories, num_rows),
    'Price (INR)': prices,
    'Quantity Sold': quantities_sold,
    'Sales (Target)': sales
}

df = pd.DataFrame(data)

# Save to CSV
df.to_csv('segmentation_cleaned.csv', index=False)

# Show the first few rows
print(df.head())


   Customer ID Customer Segment  Product ID Product Category  Price (INR)  \
0          103  Premium Shopper         130            Dairy          723   
1          436   Budget Shopper         123           Snacks          337   
2          349   Budget Shopper         198        Beverages          637   
3          271  Premium Shopper         159           Snacks          382   
4          107  Regular Shopper         144           Snacks          841   

   Quantity Sold  Sales (Target)  
0             22           15906  
1             42           14154  
2             12            7644  
3             42           16044  
4             13           10933  


In [8]:
df = pd.read_csv('/content/segmentation_cleaned.csv')

In [9]:
df.head()

Unnamed: 0,Customer ID,Customer Segment,Product ID,Product Category,Price (INR),Quantity Sold,Sales (Target)
0,103,Premium Shopper,130,Dairy,723,22,15906
1,436,Budget Shopper,123,Snacks,337,42,14154
2,349,Budget Shopper,198,Beverages,637,12,7644
3,271,Premium Shopper,159,Snacks,382,42,16044
4,107,Regular Shopper,144,Snacks,841,13,10933


In [16]:
df = pd.read_csv('/content/sales_prediction_dataset.csv')

In [17]:
df.head()

Unnamed: 0,Date,Product ID,Product Category,Price (INR),Quantity Sold,Discount (%),Promotions,Store Location,Day of the Week,Seasonality,Sales (Target)
0,2024-02-07,151,Beverages,607,45,16,Yes,Bangalore,Sunday,Peak Season,22944.6
1,2024-12-18,192,Dairy,908,26,27,No,Bangalore,Tuesday,Regular,17233.84
2,2024-12-02,114,Vegetables,65,39,24,No,Bangalore,Monday,Peak Season,1926.6
3,2024-09-07,171,Snacks,691,28,28,No,Bangalore,Sunday,Off-Season,13930.56
4,2024-06-19,160,Fruits,482,5,15,No,Bangalore,Sunday,Holiday,2048.5


In [39]:
# Import necessary libraries
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score

# Load the dataset (replace with the actual dataset path)
DATA_PATH = '/content/popularity_clean.csv'  # Replace with your dataset path
df = pd.read_csv(DATA_PATH)

# Check the structure of the dataset
print(df.head())

# Preprocess the data

# Encode categorical features (if necessary)
label_encoder_sub_category = LabelEncoder()
df['Sub Category'] = label_encoder_sub_category.fit_transform(df['Sub Category'])

label_encoder_popularity = LabelEncoder()
df['Popularity'] = label_encoder_popularity.fit_transform(df['Popularity'])  # Encoding target

# Define features (X) and target (y)
X = df[['Sub Category', 'Price (INR)', 'Has_Discount', 'Rating_score', 'Num_reviews']]  # Features
y = df['Popularity']  # Target variable

# Scaling the features (optional but recommended)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Train a RandomForestClassifier model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy * 100:.2f}%')

# Save the trained model and scaler
MODEL_PATH = '/content/popularity_model.pkl'
SCALER_PATH = '/content/popularity_scaler.pkl'
joblib.dump(model, MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)

print(f'Model saved to: {MODEL_PATH}')
print(f'Scaler saved to: {SCALER_PATH}')


  Sub Category  Price (INR)  Has_Discount  Rating_score  Num_reviews  \
0         Meat         1559             0           4.5          121   
1    Beverages          132             1           3.7          264   
2       Spices          126             1           4.2          713   
3        Dairy          189             0           3.6          393   
4       Grains          435             1           3.9          210   

  Popularity  
0     Medium  
1        Low  
2     Medium  
3        Low  
4        Low  
Model Accuracy: 100.00%
Model saved to: /content/popularity_model.pkl
Scaler saved to: /content/popularity_scaler.pkl


In [42]:
# Load the dataset (replace with the actual dataset path)
DATA_PATH = '/content/popularity_clean.csv'  # Replace with your dataset path
df = pd.read_csv(DATA_PATH)

In [43]:
df['Sub Category'].value_counts()

Unnamed: 0_level_0,count
Sub Category,Unnamed: 1_level_1
Vegetables,170
Snacks,162
Dairy,150
Fruits,150
Beverages,148
Meat,146
Spices,146
Grains,144
Frozen Foods,142
Packaged Foods,142


In [38]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Load the dataset (replace with the actual dataset path)
DATA_PATH = '/content/recommendation_dataset.csv'  # Replace with your dataset path
df = pd.read_csv(DATA_PATH)

# Check the structure of the dataset
print(df.head())

# Preprocess the data

# Features (X) and target (y)
X = df[['User ID', 'Item ID', 'Category', 'Price (INR)', 'CF Score', 'CBF Score']]  # Features
y = df['Final Score']  # Target variable

# One-hot encode the 'Category' column since it's categorical
X = pd.get_dummies(X, columns=['Category'], drop_first=True)

# Scaling the features (optional but recommended)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Train a RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error (MSE): {mse}')

# Save the trained model and scaler
MODEL_PATH = '/content/recommendation_model.pkl'
SCALER_PATH = '/content/recomend_scaler.pkl'
joblib.dump(model, MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)

print(f'Model saved to: {MODEL_PATH}')
print(f'Scaler saved to: {SCALER_PATH}')
from sklearn.preprocessing import OneHotEncoder
import joblib

# One-Hot Encoder for Category
encoder = OneHotEncoder(drop='first')
encoder.fit(df[['Category']])

# Save the encoder
joblib.dump(encoder, '/content/category_encoder.pkl')



   User ID  Item ID    Category  Price (INR)  CF Score  CBF Score  Final Score
0       85      111  Vegetables          515  3.570984   0.231845     2.569242
1       21      336   Beverages          516  4.570970   0.108186     3.232135
2       18      421      Snacks          524  4.539496   0.640917     3.369923
3       55      357  Vegetables          551  3.798178   0.523183     2.815679
4       29      498   Beverages          988  2.863081   0.899145     2.273900
Mean Squared Error (MSE): 0.00032292417607019926
Model saved to: /content/recommendation_model.pkl
Scaler saved to: /content/recomend_scaler.pkl


['/content/category_encoder.pkl']

In [44]:
DATA_PATH = '/content/recommendation_dataset.csv'  # Replace with your dataset path
df = pd.read_csv(DATA_PATH)

In [46]:
df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
Vegetables,443
Fruits,398
Beverages,397
Snacks,387
Dairy,375


In [21]:
df.head()

Unnamed: 0,User ID,Item ID,Category,Price (INR),CF Score,CBF Score,Final Score
0,85,111,Vegetables,515,3.570984,0.231845,2.569242
1,21,336,Beverages,516,4.57097,0.108186,3.232135
2,18,421,Snacks,524,4.539496,0.640917,3.369923
3,55,357,Vegetables,551,3.798178,0.523183,2.815679
4,29,498,Beverages,988,2.863081,0.899145,2.2739


In [32]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Load the dataset (replace with the actual dataset path)
DATA_PATH = '/content/segmentation_cleaned.csv'  # Update with the actual dataset path in Colab
df = pd.read_csv(DATA_PATH)

# Check the structure of the dataset
print(df.head())

# Preprocess the data

# Clean and select useful features
df_cleaned = df.dropna()  # Remove rows with missing values (if any)

# Select relevant features for segmentation
features = df_cleaned[['Price (INR)', 'Quantity Sold', 'Sales (Target)']]

# Normalize the features using StandardScaler
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Perform KMeans clustering (segmentation)
n_clusters = 4  # You can choose the number of clusters based on your requirement
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)

# Fit the model
kmeans.fit(scaled_features)

# Add the cluster labels to the original dataframe
df_cleaned['Segment'] = kmeans.labels_

# Check the cluster centers (optional)
print("Cluster centers:\n", kmeans.cluster_centers_)

# Save the trained model and scaler
MODEL_PATH = '/content/segmentation_model.pkl'
SCALER_PATH = '/content/segment_scaler.pkl'
joblib.dump(kmeans, MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)

print(f'Model saved to: {MODEL_PATH}')
print(f'Scaler saved to: {SCALER_PATH}')

# Optionally, print out the first few rows of the segmented data
print(df_cleaned.head())


   Customer ID Customer Segment  Product ID Product Category  Price (INR)  \
0          103  Premium Shopper         130            Dairy          723   
1          436   Budget Shopper         123           Snacks          337   
2          349   Budget Shopper         198        Beverages          637   
3          271  Premium Shopper         159           Snacks          382   
4          107  Regular Shopper         144           Snacks          841   

   Quantity Sold  Sales (Target)  
0             22           15906  
1             42           14154  
2             12            7644  
3             42           16044  
4             13           10933  
Cluster centers:
 [[ 0.75694259 -0.86771062 -0.32969172]
 [-0.96433543 -0.80046035 -0.89266928]
 [-0.81070253  0.86355473 -0.23741679]
 [ 0.9046374   0.88640909  1.45730142]]
Model saved to: /content/segmentation_model.pkl
Scaler saved to: /content/segment_scaler.pkl
   Customer ID Customer Segment  Product ID Product Categor