In [1]:
from faker import Faker
import pandas as pd
import random
# Generate fake user data.
# Faker.seed only covers values produced by Faker (user IDs, cities). Ages and
# genders below come from the stdlib `random` module, so it is seeded as well;
# otherwise those two columns change on every Restart & Run All.
fake = Faker('en_US')
Faker.seed(0)
random.seed(0)

# Set the number of fake users you want to generate
num_users = 2000  # Adjust this to the size of dataset you want

# Create lists to store the generated data (one list per output column)
user_ids = []
ages = []
genders = []
cities = []
countries = []

for _ in range(num_users):
    # Unique 5-digit user ID; only 90,000 distinct 5-digit values exist, so
    # num_users must stay below that for uniqueness to be satisfiable.
    user_ids.append(fake.unique.random_number(digits=5, fix_len=True))
    ages.append(random.randint(18, 60))  # Age between 18 and 60 (inclusive)
    genders.append(random.choice(['male', 'female']))  # Randomly select gender
    city = fake.city().lower()  # lower-cased so city comparisons are case-insensitive
    country = 'us'  # every generated user is placed in the same country
    cities.append(city)
    countries.append(country)

# Create a DataFrame with one row per generated user
user_data = pd.DataFrame({
    'userID': user_ids,
    'age': ages,
    'gender': genders,
    'city': cities,
    'country': countries
})

# Function to calculate similarity between two users
def calculate_similarity(user1, user2, gender_weight=0.2, age_weight=0.5,
                         location_weight=0.3, max_age_diff=100):
    """Score how alike two users are from their gender, age and location.

    Args:
        user1: Mapping (dict or DataFrame row) with 'gender', 'age', 'city'
            and 'country' entries.
        user2: Second user, same layout as ``user1``.
        gender_weight: Weight of the gender component (default 0.2).
        age_weight: Weight of the age component (default 0.5).
        location_weight: Weight of the location component (default 0.3).
        max_age_diff: Age gap at which the age component reaches 0
            (default 100). Larger gaps are clamped to 0 rather than
            producing a negative similarity.

    Returns:
        The weighted sum of the three components. With the default weights
        (which sum to 1) the score lies between 0 and 1.

    Raises:
        ValueError: If ``max_age_diff`` is not positive.
    """
    if max_age_diff <= 0:
        raise ValueError("max_age_diff must be positive")

    # Gender similarity: exact match or nothing
    gender_sim = 1 if user1['gender'] == user2['gender'] else 0

    # Age similarity: inverted, normalised age gap, floored at 0
    age_gap = abs(user1['age'] - user2['age'])
    age_sim = max(0.0, 1 - age_gap / max_age_diff)

    # Location similarity (city > country). The city check comes first, so two
    # users with equal city strings score 1 even if their countries differ.
    if user1['city'] == user2['city']:
        location_sim = 1  # Same city
    elif user1['country'] == user2['country']:
        location_sim = 0.5  # Same country, different city
    else:
        location_sim = 0  # Different country

    # Combine the similarities with weighting
    total_similarity = (
        gender_weight * gender_sim + age_weight * age_sim + location_weight * location_sim
    )
    return total_similarity

# Function to find the top 10 most similar users based on input user info
def find_top10_similar_users(user_info, user_data):
    """Print the ten rows of ``user_data`` most similar to ``user_info``.

    Each row is scored against ``user_info`` with ``calculate_similarity``.
    Rows are ranked by score, highest first; rows with equal scores keep
    their original order. The result is printed, nothing is returned.
    """
    score_position = 5  # slot of the similarity score inside each record tuple

    def build_record(row):
        # Score first, then collect the columns that get printed.
        score = calculate_similarity(user_info, row)
        return (row['userID'], row['age'], row['gender'], row['city'], row['country'], score)

    # Score every user in the dataset
    records = [build_record(row) for _, row in user_data.iterrows()]

    # Rank by score (descending) and keep the first ten
    ranked = sorted(records, key=lambda record: record[score_position], reverse=True)
    best_matches = ranked[:10]

    # Report the matches together with their scores
    print("Top 10 similar users:")
    for userid, age, gender, city, country, similarity_score in best_matches:
        print(f"User ID: {userid}, Age: {age}, Gender: {gender}, City: {city}, Country: {country}, Similarity Score: {similarity_score}")

# Sample profile that can be passed to find_top10_similar_users
user_info = dict(
    age=25,
    gender='female',
    city='hochiminh',
    country='vietnam',
)

# The similarity lookup itself is not invoked in this cell.
# Bare expression: under IPython's default settings only a cell's final
# expression is rendered, so this line produces no output here.
user_data
# Persist the generated users. pandas writes the row index as an unnamed
# first column by default.
# NOTE(review): absolute, machine-specific output path - confirm it exists
# wherever this notebook is run.
user_data.to_csv('D:/RecommendModel/user_data.csv')

In [2]:
# Build a synthetic ratings table: random (user, product) pairs, each with the
# product's categories and a random rating.
product_data = pd.read_csv('../train_data.csv')
num_entries = 6000

# random.choice picks an element with seq[i]. On a pandas Series that is a
# label lookup, which only agrees with position i while the index is the
# default 0..n-1 range. Sampling from plain lists makes the draw positional
# regardless of how the frames are indexed.
user_id_pool = user_data['userID'].tolist()
product_id_pool = product_data['id'].tolist()

# Draws happen in the order users -> products -> ratings so that, for a given
# `random` state, the sampled values follow the same sequence as before.
sampled_user_ids = [random.choice(user_id_pool) for _ in range(num_entries)]
sampled_product_ids = [random.choice(product_id_pool) for _ in range(num_entries)]
sampled_ratings = [round(random.uniform(1, 5), 1) for _ in range(num_entries)]  # Random rating between 1.0 and 5.0

# Lookup table: product id -> {'main_category': ..., 'sub_category': ...}
product_mapping = product_data.set_index('id')[['main_category', 'sub_category']].to_dict('index')

# Populate main_category and sub_category based on the chosen productid
random_data = {
    'userid': sampled_user_ids,
    'productid': sampled_product_ids,
    'main_category': [product_mapping[pid]['main_category'] for pid in sampled_product_ids],
    'sub_category': [product_mapping[pid]['sub_category'] for pid in sampled_product_ids],
    'rating': sampled_ratings,
}

# Create the final DataFrame
ratings_data = pd.DataFrame(random_data)

# Display the first few rows
print(ratings_data.head())

   userid  productid        main_category           sub_category  rating
0   90187  hkt242817       home & kitchen              Furniture     2.0
1   68271  acs019412          accessories    Handbags & Clutches     4.1
2   82010  msh089295          men's shoes           Casual Shoes     4.9
3   69735  str226246               stores  The Designer Boutique     1.6
4   70976  tvc114892  tv, audio & cameras               Speakers     1.3


In [3]:
# ratings_data as built in this notebook has only the columns userid,
# productid, main_category, sub_category and rating, so a strict drop of
# 'Unnamed: 0' raises KeyError on a fresh Restart & Run All. errors='ignore'
# removes the column when it is present and does nothing otherwise, which
# also makes this cell safe to re-run.
ratings_data = ratings_data.drop(columns='Unnamed: 0', errors='ignore')
# pandas writes the row index as an unnamed first column by default.
# NOTE(review): absolute, machine-specific output path - confirm it exists
# wherever this notebook is run.
ratings_data.to_csv('D:/RecommendModel/ratings.csv')