In [4]:
# Use the explicit %pip magic: it installs into the running kernel's environment
# and does not depend on IPython automagic being enabled.
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Report the pandas distribution installed in the kernel's environment
# (explicit %pip magic rather than relying on IPython automagic).
%pip show pandas

Name: pandas
Note: you may need to restart the kernel to use updated packages.

Version: 2.2.2
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: 
Author-email: The Pandas Development Team <pandas-dev@python.org>
License: BSD 3-Clause License

Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
All rights reserved.

Copyright (c) 2011-2023, Open source contributors.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of

In [5]:
import pandas as pd


In [6]:
import numpy as np
import pandas as pd
from faker import Faker
from datetime import date, timedelta
import random

# Faker instance created with the 'en_GB' locale; the generation loop below
# uses it to draw random lease and move-in dates.
fake = Faker('en_GB')

# Sampling pool of UK counties. Under a uniform pick from this list a county's
# share of the draws is proportional to how often it is repeated, so the
# repetition pattern and the element order are kept exactly as before.
uk_counties = (
    ['West Sussex', 'West Yorkshire', 'Wiltshire', 'Worcestershire']
    + ['Northumberland', 'Nottinghamshire', 'Oxfordshire', 'Rutland', 'Shropshire', 'Somerset']
    # This eleven-county run appears twice back to back.
    + ['South Yorkshire', 'Staffordshire', 'Suffolk', 'Surrey', 'Tyne and Wear', 'Warwickshire',
       'West Midlands', 'West Sussex', 'West Yorkshire', 'Wiltshire', 'Worcestershire'] * 2
    # Two further passes over the four "W" counties.
    + ['West Sussex', 'West Yorkshire', 'Wiltshire', 'Worcestershire'] * 2
    # Trailing run that makes Worcestershire the most frequent entry.
    + ['Worcestershire'] * 6
)

# Function to calculate arrears status based on multiple factors
def calculate_arrears_status(employment_status, income_level, property_condition, rental_price,
                             satisfaction, socio_economic_background, household_size, presence_of_guarantor,
                             credit_score, tenancy_by_entirety, benefit_cap, move_in_date, lease_end_date,
                             property_age, location, num_amenities, age, lease_start_date, property_type,
                             highest_education, income_debt_ratio, timeliness_score, rental_price_income_ratio,
                             gender=None, threshold=0.99):
    """Label a tenant's arrears status from a weighted, rule-based risk score.

    Each rule below is a ``(condition, weight)`` pair. The weights of all rules
    whose condition holds are added up (negative weights lower the score) and
    the total is compared with ``threshold``. The total is a score, not a
    probability: it can be negative or exceed 1.

    Parameters
    ----------
    employment_status, property_condition, socio_economic_background,
    presence_of_guarantor, tenancy_by_entirety, benefit_cap, location,
    property_type, highest_education : str
        Categorical attributes compared against the literals used in the rules
        (e.g. ``'Unemployed'``, ``'Outdated'``, ``'Low'``, ``'Yes'``/``'No'``).
    income_level, rental_price, satisfaction, household_size, credit_score,
    property_age, num_amenities, age, income_debt_ratio, timeliness_score,
    rental_price_income_ratio : number
        Numeric attributes compared against fixed cut-offs.
    move_in_date, lease_end_date, lease_start_date : object with a ``year`` attribute
        Only the ``.year`` of each date is inspected.
    gender : str, optional
        ``'Male'`` adds 0.2 to the score. When omitted, a module-level variable
        named ``gender`` is used if one exists (this is how the function
        previously obtained the value); if none exists the rule is skipped.
    threshold : float, optional
        The score must be strictly greater than this value for the status to
        be ``'Active'``. Defaults to 0.99.

    Returns
    -------
    str
        ``'Active'`` if the score exceeds ``threshold``, otherwise ``'Inactive'``.
    """
    if gender is None:
        # Backward compatibility: older callers do not pass `gender` and relied
        # on a global of that name. Prefer passing it explicitly.
        gender = globals().get('gender')

    # (condition, weight) rules, evaluated in this order.
    arrears_factors = [
        (employment_status in ['Unemployed', 'Student'], 0.14),  # Not in employment
        (income_level < 30000, 0.2),  # Low income
        (gender == 'Male', 0.2),  # Male tenants
        (property_condition == 'Outdated', 0.1),  # Outdated property
        (rental_price > 2000, 0.1),  # High rental price
        (satisfaction < 3, 0.1),  # Low satisfaction
        (income_level < 20000 and socio_economic_background == 'Low', 0.2),  # Very low income with low socio-economic background
        (household_size > 4, 0.15),  # Large household
        (presence_of_guarantor == 'No', 0.2),  # No guarantor
        (credit_score < 600, 0.25),  # Poor credit score
        (tenancy_by_entirety == 'No', 0.15),  # Not a tenancy by entirety
        (benefit_cap == 'Yes', -0.1),  # Benefit cap lowers the score
        (move_in_date.year < 2020, 0.1),  # Moved in before 2020
        (lease_end_date.year < 2025, 0.1),  # Lease ends before 2025
        (property_age > 20, 0.1),  # Property older than 20 years
        (location in ['West Sussex', 'Worcestershire'], 0.2),  # Property located in West Sussex or Worcestershire
        (num_amenities == 1, 0.1),  # Only a single amenity
        (age > 65, 0.15),  # Tenant older than 65
        (lease_start_date.year < 2010, 0.2),  # Lease started before 2010
        (property_type == 'Apartment', 0.2),  # Apartment
        (socio_economic_background == 'Low' and highest_education == 'High School', 0.1),  # Low socio-economic background with high school education
        (rental_price_income_ratio > 0.3, 0.15),  # Rent is a large share of income
        (age < 30, 0.2),  # Tenant younger than 30
        (income_level > 80000, -0.1),  # High income lowers the score
        (property_age < 5, -0.1),  # New property lowers the score
        (num_amenities >= 4, -0.1),  # Many amenities lower the score
        (household_size == 1, -0.1),  # Single-person household lowers the score
        (presence_of_guarantor == 'Yes', -0.2),  # Guarantor lowers the score
        # NOTE(review): a HIGH income-to-debt ratio raises the score here, which
        # looks inverted for a risk rule — confirm this is intended.
        (income_debt_ratio > 2, 0.2),
        (timeliness_score < 50, 0.2),  # Low timeliness score
        (age > 50 and satisfaction < 4, 0.1),  # Tenant over 50 with satisfaction below 4
        (credit_score < 500 and income_level < 40000, 0.2),  # Very poor credit score with income below 40000
        (property_age > 30 and property_condition == 'Outdated', 0.1),  # Property over 30 years old and outdated
    ]

    # Add up the weights of every rule that applies.
    arrears_score = sum(weight for applies, weight in arrears_factors if applies)

    return 'Active' if arrears_score > threshold else 'Inactive'



# Seed every random source used below so that re-running this cell regenerates
# the same synthetic tenants. Dates are drawn relative to the current day, so
# they still shift with the run date.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
fake.seed_instance(SEED)

combined_data = []
num_tenants = 100000

# A lease must have started at least 31 days ago, which leaves room for a
# move-in date at least 30 days after the lease start. Bounding the draw
# directly replaces the earlier "draw again until valid" loop.
latest_lease_start = date.today() - timedelta(days=31)

for i in range(num_tenants):
    tenant_id = f"TEN-{i+1:05}"  # Unique, zero-padded tenant ID

    # Age drawn from Normal(mean=50, std=15), truncated to an int and clamped to 18..90.
    age = int(np.random.normal(50, 15))
    age = max(18, min(90, age))

    # NOTE: `gender` has to stay a module-level name. calculate_arrears_status is
    # not passed it in the call below and resolves it from the global scope.
    gender = random.choice(['Male', 'Female'])

    # Employment status depends on age: 'Student' is only possible below 45 and
    # everyone aged 75 or over is 'Retired'.
    if age < 45:
        employment_status = random.choice(['Employed', 'Unemployed', 'Student'])
    elif age >= 75:
        employment_status = 'Retired'
    else:
        employment_status = random.choice(['Employed', 'Unemployed'])

    # Income range depends on employment status.
    if employment_status == 'Unemployed':
        income_level = random.randint(10000, 30000)
    elif employment_status == 'Student':
        income_level = random.randint(5000, 20000)
    elif employment_status == 'Retired':
        income_level = random.randint(20000, 50000)
    else:
        income_level = random.randint(50000, 140000)

    # Socio-economic background depends on income; repeated entries in each
    # pool weight the uniform draw towards that value.
    if income_level < 20000:
        socio_economic_background = random.choice(['Low', 'Medium', 'Low', 'Low'])
    elif income_level >= 100000:
        socio_economic_background = random.choice(['High', 'Medium', 'High', 'High'])
    else:
        socio_economic_background = random.choice(['Medium', 'Medium', 'High', 'Medium'])

    # Highest education is drawn from an age-dependent pool; older age bands
    # are weighted towards higher degrees.
    if age < 25:
        highest_education = random.choice(['High School', 'High School', 'High School', 'Bachelor'])
    elif age < 35:
        highest_education = random.choice(['High School', 'Bachelor', 'Bachelor', 'Master'])
    elif age < 60:
        highest_education = random.choice(['Bachelor', 'Bachelor', 'Master', 'PhD'])
    else:
        highest_education = random.choice(['Master', 'PhD', 'PhD', 'PhD'])

    # Lease start within the last five years (and at least 31 days ago);
    # move-in between 30 days after the lease start and today.
    lease_start_date = fake.date_between(start_date='-5y', end_date=latest_lease_start)
    move_in_date = fake.date_between(start_date=lease_start_date + timedelta(days=30), end_date='today')

    lease_end_date = fake.date_between(start_date='+2y', end_date='+10y')  # Lease ends 2 to 10 years from now
    location = random.choice(uk_counties)  # County drawn from the weighted pool
    property_type = random.choice(['Apartment', 'House', 'Condominium'])

    # Property size drawn from Normal(mean=1500, std=500), clamped to 500..3000.
    size = int(np.random.normal(1500, 500))
    size = max(500, min(3000, size))

    # Number of amenities depends on the property type.
    if property_type == 'Apartment':
        num_amenities = random.randint(2, 4)
    elif property_type == 'House':
        num_amenities = random.randint(1, 2)
    else:
        num_amenities = random.randint(4, 5)

    # Rental price drawn from Normal(mean=1500, std=500), clamped to 500..3000.
    rental_price = int(np.random.normal(1500, 500))
    rental_price = max(500, min(3000, rental_price))

    property_age = random.randint(1, 50)  # Age of the property in years

    # Property condition depends on property age.
    if property_age <= 5:
        property_condition = random.choice(['New', 'Renovated'])
    elif property_age <= 25:
        property_condition = random.choice(['Renovated', 'Outdated'])
    else:
        property_condition = 'Outdated'

    tenancy_by_entirety = random.choice(['Yes', 'No'])
    benefit_cap = random.choice(['Yes', 'No'])

    # Satisfaction is a uniform draw from 1 to 5; it is independent of income.
    satisfaction = random.randint(1, 5)

    # Debt is a random multiple (0 to 2) of income; max(1, debt) guards the
    # division when the drawn debt is below 1.
    debt = random.uniform(0, 2) * income_level
    income_debt_ratio = income_level / max(1, debt)

    timeliness_score = random.randint(0, 100)  # Uniform score from 0 to 100
    credit_score = random.randint(300, 850)  # Uniform score from 300 to 850
    household_size = random.randint(1, 5)  # Uniform size from 1 to 5
    presence_of_guarantor = random.choice(['Yes', 'No'])

    # Ratio of rental price to income.
    rental_price_income_ratio = rental_price / max(1, income_level)

    # Derive the arrears label from the generated attributes. Keyword arguments
    # protect against mixing up the many positional parameters.
    arrears_status = calculate_arrears_status(
        employment_status=employment_status,
        income_level=income_level,
        property_condition=property_condition,
        rental_price=rental_price,
        satisfaction=satisfaction,
        socio_economic_background=socio_economic_background,
        household_size=household_size,
        presence_of_guarantor=presence_of_guarantor,
        credit_score=credit_score,
        tenancy_by_entirety=tenancy_by_entirety,
        benefit_cap=benefit_cap,
        move_in_date=move_in_date,
        lease_end_date=lease_end_date,
        property_age=property_age,
        location=location,
        num_amenities=num_amenities,
        age=age,
        lease_start_date=lease_start_date,
        property_type=property_type,
        highest_education=highest_education,
        income_debt_ratio=income_debt_ratio,
        timeliness_score=timeliness_score,
        rental_price_income_ratio=rental_price_income_ratio,
    )

    # One row per tenant; the field order must match the DataFrame column list.
    combined_data.append([tenant_id, age, gender, employment_status, income_level,
                          socio_economic_background, highest_education, move_in_date,
                          lease_start_date, lease_end_date, location, property_type,
                          size, num_amenities, rental_price, property_age, property_condition,
                          tenancy_by_entirety, benefit_cap, satisfaction, arrears_status,
                          income_debt_ratio, timeliness_score, credit_score,
                          household_size, presence_of_guarantor, rental_price_income_ratio])

# Turn the generated rows into a DataFrame; the column labels are listed in
# the same order as the fields of each row.
combined_df = pd.DataFrame(
    data=combined_data,
    columns=[
        # Tenant identity and demographics
        'Tenant ID', 'Tenant Age', 'Gender',
        'Employment Status', 'Income Level',
        'Socio-economic Background', 'Highest Education',
        # Tenancy dates
        'Move-in Date', 'Lease Start Date', 'Lease End Date',
        # Property attributes
        'Location', 'Property Type', 'Property Size',
        'Num Amenities', 'Rental Price', 'Property Age',
        'Property Condition', 'Tenancy by Entirety',
        # Tenant status and derived metrics
        'Benefit Cap', 'Satisfaction', 'Arrears Status',
        'Income to Debt Ratio', 'Timeliness Score', 'Credit Score',
        'Household Size', 'Presence of Guarantor', 'Rental Price to Income Ratio',
    ],
)

# Write the table to CSV; index=False leaves the row index out of the file.
combined_df.to_csv("tenant_combined_data_realistic_with_metrics_test.csv", index=False)


In [2]:
# Load an exported tenant dataset and preview its first five rows.
# NOTE(review): this file name differs from the '..._test.csv' written by the
# generation cell — confirm which export is meant to be analysed.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests this CSV
# was saved with its index; consider index_col=0 if nothing downstream needs it.
df = pd.read_csv(filepath_or_buffer='tenant_combined_data_realistic_with_metrics1.csv')
df.head(n=5)

Unnamed: 0,Tenant ID,Tenant Age,Gender,Employment Status,Income Level,Socio-economic Background,Highest Education,Move-in Date,Lease Start Date,Lease End Date,...,Tenancy by Entirety,Benefit Cap,Satisfaction,Arrears Status,Income to Debt Ratio,Timeliness Score,Credit Score,Household Size,Presence of Guarantor,Rental Price to Income Ratio
0,TEN-00001,70,Female,Unemployed,17862,Low,PhD,2024-03-17,2024-02-07,2034-03-04,...,Yes,No,3,Active,13.816965,70,346,1,No,0.114657
1,TEN-00002,47,Male,Unemployed,11882,Low,Master,2024-02-23,2023-05-05,2029-07-11,...,No,Yes,2,Active,0.927262,36,681,2,No,0.11286
2,TEN-00003,33,Male,Unemployed,24529,Medium,Bachelor,2023-12-07,2019-03-31,2026-10-20,...,Yes,Yes,1,Active,0.612466,42,500,3,No,0.057442
3,TEN-00004,29,Female,Employed,89782,Medium,Master,2023-11-29,2021-02-09,2031-08-25,...,No,Yes,5,Inactive,148.188381,100,559,1,Yes,0.019503
4,TEN-00005,53,Male,Unemployed,26066,Medium,Bachelor,2022-09-18,2019-07-08,2028-04-26,...,No,No,1,Active,0.562226,8,842,5,Yes,0.071319
