# Real Estate Data Analysis – EDA & Cleaning

Not just reading numbers — asking what they mean for the business 

## Import Libraries 

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

##  Address Dataset

### Load & Preview

In [None]:
address_df = pd.read_csv(r"\01_Dataset\addresses.csv")
address_df.head()

Unnamed: 0,address_id,street,city,state,latitude,longitude
0,1,5545 Maxwell Springs,Joliet,IL,41.59586,-88.086585
1,2,9774 Nicole Greens,Springfield,IL,39.784853,-89.655079
2,3,13699 Baldwin Wells,Elgin,IL,42.014133,-88.22813
3,4,5822 Kane Plaza,Chicago,IL,41.953988,-87.677038
4,5,710 Jesus Lakes,Tallahassee,FL,30.419741,-84.260663


In [3]:
address_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   address_id  3000 non-null   int64  
 1   street      3000 non-null   object 
 2   city        3000 non-null   object 
 3   state       3000 non-null   object 
 4   latitude    3000 non-null   float64
 5   longitude   3000 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 140.8+ KB


In [4]:
address_df.nunique()

address_id    3000
street        3000
city            40
state            5
latitude      2999
longitude     2998
dtype: int64

In [5]:
"""
no nulls / duplicates
datatypes are correct
The street is unique and represents the ID
All addresses in one country (USA), and 5 states
"""

'\nno nulls / duplicates\ndatatypes are correct\nThe street is unique and represents the ID\nAll addresses in one country (USA), and 5 states\n'

In [6]:
# data is cleaned
address_df.to_csv("addresses_clean.csv", index=False)

## Client dataset

### Load & Preview Data

In [None]:
client_df = pd.read_csv(r"clients.csv")
client_df.head()

Unnamed: 0,client_id,name,gender,phone_number,email,address_id,age
0,1,Emily Johnson,Female,899-378-9434,tyler17@example.org,2899,45
1,2,Diane Hanson,Female,001-738-239-3519x8660,nicole88@example.net,2056,25
2,3,Amy Johnson,Female,770-866-7647,bwatson@example.net,2382,57
3,4,Miranda Green,Female,(429)742-5377,contrerasgina@example.com,12,36
4,5,Kelsey Ballard,Female,001-778-757-2739,hproctor@example.net,1809,31


In [8]:
client_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2060 entries, 0 to 2059
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   client_id     2060 non-null   int64 
 1   name          2060 non-null   object
 2   gender        2060 non-null   object
 3   phone_number  2054 non-null   object
 4   email         2053 non-null   object
 5   address_id    2060 non-null   int64 
 6   age           2060 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 112.8+ KB


In [9]:
client_df.duplicated().sum()

np.int64(0)

In [10]:
client_df.nunique()

client_id       2060
name            2034
gender             2
phone_number    2054
email           2041
address_id      1484
age               50
dtype: int64

In [11]:
"""
There are no duplicate clients, but there are shared phone numbers and email addresses.
Since I won't need them for the analysis, I'll drop these columns
"""

"\nThere are no duplicate clients, but there are shared phone numbers and email addresses.\nSince I won't need them for the analysis, I'll drop these columns\n"

### Data Cleaning

In [12]:
client_df.drop(columns=["email","phone_number"],inplace=True)

In [13]:
client_df.to_csv("client_clean.csv",index=False)

### Descriptive Statistics & Group Insights

In [14]:
client_df.describe(include="all")

Unnamed: 0,client_id,name,gender,address_id,age
count,2060.0,2060,2060,2060.0,2060.0
unique,,2034,2,,
top,,Michael Lawson,Female,,
freq,,2,1054,,
mean,1030.5,,,1490.234466,40.115534
std,594.815097,,,868.80334,11.183717
min,1.0,,,6.0,21.0
25%,515.75,,,723.75,32.0
50%,1030.5,,,1470.5,40.0
75%,1545.25,,,2238.75,48.0


In [15]:
"""
total clint : 2060,
mostly Female, 
aged 21-70, with an average of 40
"""

'\ntotal clint : 2060,\nmostly Female, \naged 21-70, with an average of 40\n'

### Data Visualization

In [16]:
fig1 = px.histogram(client_df, x="age", nbins=20, title="client age distibution")
fig1.show()

In [17]:
fig2 = px.pie(client_df,names='gender',title='client Gender Distribution')
fig2.show()

In [18]:
# most clients are concentrated in the age groups between 35-44, as well as between 50-54.
# Additionally, the proportion of female clients is higher than that of male clients.

## Agent Datset

### Load & Preview Data

In [None]:
agent_df = pd.read_csv(r"agents.csv")
agent_df.sample(10)

Unnamed: 0,agent_id,name,gender,age,email,phone,hire_date,agent_type
177,175,Thomas Calderon,Female,35,thomascalderon@gmail.com,314-600-0968x8290,7/2/2020,Sales Agent
220,218,Dylan Moore,Female,63,dylanmoore@gmail.com,(911)595-6635x342,7/20/2023,Sales Agent
70,68,Alicia Walsh,Male,44,aliciawalsh@hotmail.com,-5581,2/7/2024,Property Manager
114,112,Dr. Jennifer Meyer MD,Female,47,drjennifermeyermd@gmail.com,639-944-2396x108,11/11/2023,Sales Agent
45,45,Jacob Ware,Female,32,jacobware@yahoo.com,(274)492-6197,12/31/2020,Leasing Agent
102,100,Christian Anderson,Female,55,christiananderson@hotmail.com,(674)747-3435x736,9/9/2024,Leasing Agent
244,242,Kelsey Andrade,Male,44,kelseyandrade@gmail.com,(577)393-4491x5344,7/1/2021,Sales Agent
168,166,Gregory Wright,Female,39,gregorywright@hotmail.com,+1-856-415-5046x30389,8/29/2022,Property Manager
161,159,Jeffrey Khan,Male,63,jeffreykhan@gmail.com,(641)438-7325x88505,6/8/2020,Leasing Agent
11,12,Mary Walker,Female,40,marywalker@yahoo.com,829.629.6197x37011,7/11/2020,Leasing Agent


In [20]:
agent_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253 entries, 0 to 252
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   agent_id    253 non-null    int64 
 1   name        253 non-null    object
 2   gender      253 non-null    object
 3   age         253 non-null    int64 
 4   email       253 non-null    object
 5   phone       253 non-null    object
 6   hire_date   253 non-null    object
 7   agent_type  253 non-null    object
dtypes: int64(2), object(6)
memory usage: 15.9+ KB


In [21]:
agent_df.duplicated().sum()

np.int64(3)

In [22]:
print(agent_df[agent_df.duplicated()])

    agent_id            name gender  age                      email  \
30        13  Janice Johnson   Male   59    janicejohnson@gmail.com   
55        35    Kevin Robles   Male   36    kevinrobles@hotmail.com   
76        74  Maureen Nelson   Male   34  maureennelson@hotmail.com   

                    phone  hire_date     agent_type  
30      932.866.5342x5834  1/22/2024  Leasing Agent  
55  +1-854-249-3009x17595  10/6/2021  Leasing Agent  
76     419.262.9067x31571  4/14/2024    Sales Agent  


In [23]:
"""
There are duplicate records in agent dataset so i'll drop the dupicated record
and convert hire_date type to date
"""

"\nThere are duplicate records in agent dataset so i'll drop the dupicated record\nand convert hire_date type to date\n"

### Data Cleaning

In [24]:
agent_df.drop_duplicates(subset=['agent_id'], keep='first',inplace=True)

In [25]:
agent_df["hire_date"]=pd.to_datetime(agent_df["hire_date"])

In [26]:
# data is cleaned
agent_df.to_csv("agent_clean.csv", index=False)

### Descriptive Statistics & Group Insights

In [27]:
agent_df.describe(include="all")

Unnamed: 0,agent_id,name,gender,age,email,phone,hire_date,agent_type
count,250.0,250,250,250.0,250,250,250,250
unique,,248,2,,249,250,,3
top,,Michael Johnson,Female,,jennifergoodman@yahoo.com,678-586-6391x9639,,Leasing Agent
freq,,2,129,,2,1,,102
mean,125.5,,,45.432,,,2022-05-24 12:46:04.800000,
min,1.0,,,25.0,,,2020-01-12 00:00:00,
25%,63.25,,,35.0,,,2021-02-16 06:00:00,
50%,125.5,,,45.0,,,2022-03-23 00:00:00,
75%,187.75,,,56.0,,,2023-08-20 18:00:00,
max,250.0,,,65.0,,,2024-12-08 00:00:00,


In [28]:
print(agent_df["gender"].value_counts())

gender
Female    129
Male      121
Name: count, dtype: int64


In [29]:
"""
total agent : 250,
mostly female, 
aged 25-65, with an average of 45
"""

'\ntotal agent : 250,\nmostly female, \naged 25-65, with an average of 45\n'

### Data Visualization

In [30]:
agent_df["hire_year"] = agent_df["hire_date"].dt.year
hire_per_year = agent_df.groupby("hire_year").agg(agents_hired=("agent_id","count")).reset_index()

fig3 = px.line(hire_per_year,x="hire_year",y="agents_hired",markers=True,title="Number of Agents Hired Per Year")
fig3.show()

In [31]:
fig4 = px.pie(agent_df,names='agent_type',title='agent_type Distribution')
fig4.show()

## Owner Dataset

### Load & Preview Data


In [None]:
owner_df = pd.read_csv(r"owners.csv")
owner_df.sample(10)

Unnamed: 0,owner_id,name,gender,age,email,phone
37,38,Miguel Jones,Female,49,migueljones@gmail.com,(898)709-9848
342,343,Nicole Fitzgerald,Female,44,nicolefitzgerald@hotmail.com,001-910-712-9598x887
193,194,George Davis,Female,36,georgedavis@gmail.com,+1-505-814-6022x697
1079,1080,Donald Lopez,Female,72,donaldlopez@gmail.com,001-700-947-7817x66876
1685,1686,Kenneth Choi,Female,78,kennethchoi@hotmail.com,9138812548
1744,1745,Denise Watkins,Male,75,denisewatkins@hotmail.com,410-981-6719
986,987,Nicolas Dunn,Female,52,nicolasdunn@gmail.com,001-261-669-9940
307,308,Tina Miller,Male,75,tinamiller@yahoo.com,+1-874-576-2940x72265
472,473,Wayne Pratt,Female,45,waynepratt@yahoo.com,+1-999-684-5909x638
902,903,Luke Hendricks,Female,76,lukehendricks@gmail.com,+1-660-581-4319


In [33]:
owner_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1755 entries, 0 to 1754
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   owner_id  1755 non-null   int64 
 1   name      1755 non-null   object
 2   gender    1755 non-null   object
 3   age       1755 non-null   int64 
 4   email     1715 non-null   object
 5   phone     1705 non-null   object
dtypes: int64(2), object(4)
memory usage: 82.4+ KB


In [34]:
owner_df.duplicated().sum()

np.int64(0)

In [35]:
"""
There are no duplicate owners, but there are missing data in  phone numbers and email addresses.
Since I won't need them for the analysis, I'll drop these columns
"""

"\nThere are no duplicate owners, but there are missing data in  phone numbers and email addresses.\nSince I won't need them for the analysis, I'll drop these columns\n"

### Data Cleaning

In [36]:
owner_df.drop(columns=["email","phone"],inplace=True)

In [37]:
owner_df.to_csv("owner_clean.csv",index=False)

### Descriptive Statistics & Group Insights

In [38]:
owner_df.describe(include="all")

Unnamed: 0,owner_id,name,gender,age
count,1755.0,1755,1755,1755.0
unique,,1740,2,
top,,Thomas Jackson,Male,
freq,,3,908,
mean,878.0,,,55.156695
std,506.769178,,,14.487283
min,1.0,,,30.0
25%,439.5,,,43.0
50%,878.0,,,56.0
75%,1316.5,,,68.0


In [39]:
"""
number of owners : 1755
mostely males,
aged between 30-80 , with an avg of 56
"""

'\nnumber of owners : 1755\nmostely males,\naged between 30-80 , with an avg of 56\n'

## Property Dataset

### Load & Preview Data


In [None]:
property_df = pd.read_csv(r"\properties.csv")
property_df.head()

Unnamed: 0,property_id,owner_id,property_manager_id,address_id,listing_date,property_type,rooms,bathrooms,parking,condition,listing_type,listing_price
0,1,1360,33,1,5/9/2021,House,5,3,3,Good,Sale,482041
1,2,657,151,2,12/18/2024,Office,3,2,3,Fair,Rent,1987
2,3,206,183,3,10/26/2021,Studio,1,1,1,Fair,Rent,1228
3,4,1220,56,4,11/1/2024,Studio,1,1,1,Fair,Sale,105380
4,5,1269,142,5,11/12/2020,Apartment,2,1,1,Good,Rent,2026


In [41]:
property_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   property_id          3000 non-null   int64 
 1   owner_id             3000 non-null   int64 
 2   property_manager_id  3000 non-null   int64 
 3   address_id           3000 non-null   int64 
 4   listing_date         3000 non-null   object
 5   property_type        3000 non-null   object
 6   rooms                3000 non-null   int64 
 7   bathrooms            3000 non-null   int64 
 8   parking              3000 non-null   int64 
 9   condition            3000 non-null   object
 10  listing_type         3000 non-null   object
 11  listing_price        3000 non-null   int64 
dtypes: int64(8), object(4)
memory usage: 281.4+ KB


In [42]:
property_df.duplicated().sum()


np.int64(0)

### Data Cleaning

In [43]:
property_df["listing_date"]=pd.to_datetime(property_df["listing_date"])

In [44]:
property_df.to_csv("properties_clean.csv",index=False)

### Descriptive Statistics & Group Insights

In [45]:
pd.set_option('display.float_format', '{:,}'.format)

In [46]:
property_df.describe(include="all")


Unnamed: 0,property_id,owner_id,property_manager_id,address_id,listing_date,property_type,rooms,bathrooms,parking,condition,listing_type,listing_price
count,3000.0,3000.0,3000.0,3000.0,3000,3000,3000.0,3000.0,3000.0,3000,3000,3000.0
unique,,,,,,5,,,,3,2,
top,,,,,,Studio,,,,Good,Sale,
freq,,,,,,618,,,,1804,1645,
mean,1500.5,874.643,128.806,1500.5,2022-06-13 07:19:40.800000,,3.6016666666666666,2.5286666666666666,1.537,,,258386.37833333333
min,1.0,1.0,8.0,1.0,2020-01-01 00:00:00,,1.0,1.0,0.0,,,977.0
25%,750.75,429.75,68.0,750.75,2021-03-18 00:00:00,,2.0,1.0,1.0,,,3887.75
50%,1500.5,880.0,137.0,1500.5,2022-05-31 12:00:00,,3.0,2.0,2.0,,,140503.0
75%,2250.25,1323.0,178.0,2250.25,2023-08-25 06:00:00,,5.0,3.0,3.0,,,323591.75
max,3000.0,1755.0,250.0,3000.0,2024-12-30 00:00:00,,8.0,6.0,3.0,,,2098160.0


In [47]:
property_df["listing_type"].value_counts()


listing_type
Sale    1645
Rent    1355
Name: count, dtype: int64

In [48]:
"""
number of listing properties : 3,000
mostely are good condition, and Studio
the  highest proportion is for sale 
prieced between 3,887 - 323,591 , with an avg of 140,503      note (if lsting type is rent  -->  price per mounth)
the  highest price is 2,098,160**
"""

'\nnumber of listing properties : 3,000\nmostely are good condition, and Studio\nthe  highest proportion is for sale \nprieced between 3,887 - 323,591 , with an avg of 140,503      note (if lsting type is rent  -->  price per mounth)\nthe  highest price is 2,098,160**\n'

### Data Visualization

In [49]:
fig4 = px.box(property_df,
             x="listing_type",
             y="listing_price",
             color="listing_type",
             title="Price Distribution by Listing Type",
             labels={"listing_type":"listing_type", "listing_type":"Listing Type"})
fig4.update_yaxes(type="log")
fig4.show()


In [50]:
# The box plot shows that Sale prices have a much wider distribution compared to Rent prices, with several high-value outliers. 
# Most of the values are small, but there are a few high-value properties are affecting the average.
#require further investigation to analyse extreme values.

### understand sale outliers

In [51]:
property_df["no_rooms"]= (property_df["rooms"]+property_df["bathrooms"]+property_df["parking"])
# فلترة على النوع "Sale"
sale_df = property_df[property_df["listing_type"] == "Sale"]
# فلترة على النوع "Rent"
rent_df = property_df[property_df["listing_type"] == "Rent"]

fig5=px.scatter(sale_df, x='no_rooms', y='listing_price', color='property_type',title="Sales Listing Price vs. Rooms by Property Type")
fig5.show()

In [52]:
fig6=px.scatter(rent_df, x='no_rooms', y='listing_price', color='property_type',title="Rent Listing Price vs. Rooms by Property Type")
fig6.show()

In [53]:
#The scatter plot shows that, according to the property type, the listing price tends to increase as the number of rooms increases.

## Maintenance Dataset

### Load & Preview Data


In [None]:
ment_df = pd.read_csv(r"maintenance.csv")
ment_df.head()

Unnamed: 0,maintenance_id,property_id,agent_id,owner_id,description,maintenance_type,cost,maintenance_date
0,1,2,151,657,Install Cleaning at property,Cleaning,1770,12/30/2024
1,2,2,151,657,Install Plumbing at property,Plumbing,1275,12/27/2024
2,3,3,183,206,Install Roof at property,Roof,1999,7/5/2022
3,4,3,183,206,Fix Painting at property,Painting,115,6/16/2023
4,5,4,56,1220,Install Electrical at property,Electrical,2266,11/20/2024


In [55]:
ment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3645 entries, 0 to 3644
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   maintenance_id    3645 non-null   int64 
 1   property_id       3645 non-null   int64 
 2   agent_id          3645 non-null   int64 
 3   owner_id          3645 non-null   int64 
 4   description       3645 non-null   object
 5   maintenance_type  3645 non-null   object
 6   cost              3645 non-null   int64 
 7   maintenance_date  3645 non-null   object
dtypes: int64(5), object(3)
memory usage: 227.9+ KB


### Data Cleaning

In [56]:
ment_df["maintenance_date"]=pd.to_datetime(ment_df["maintenance_date"])

In [57]:
ment_df.to_csv("maintenance_clean.csv",index=False)

### Descriptive Statistics & Group Insights

In [58]:
ment_df.describe(include="all")

Unnamed: 0,maintenance_id,property_id,agent_id,owner_id,description,maintenance_type,cost,maintenance_date
count,3645.0,3645.0,3645.0,3645.0,3645,3645,3645.0,3645
unique,,,,,28,7,,
top,,,,,Fix Roof at property,Roof,,
freq,,,,,147,551,,
mean,1823.0,1519.4499314128943,129.28422496570644,890.5026063100137,,,1989.4106995884772,2023-09-26 16:23:18.518518784
min,1.0,2.0,8.0,1.0,,,22.0,2020-03-20 00:00:00
25%,912.0,766.0,68.0,442.0,,,802.0,2023-01-27 00:00:00
50%,1823.0,1525.0,137.0,895.0,,,1594.0,2024-01-17 00:00:00
75%,2734.0,2333.0,183.0,1354.0,,,2355.0,2024-08-19 00:00:00
max,3645.0,2998.0,250.0,1755.0,,,6992.0,2024-12-30 00:00:00


In [59]:
cost_per_type = ment_df.groupby("maintenance_type")["cost"].sum()
print(cost_per_type)

maintenance_type
Appliances     974080
Cleaning      1026686
Electrical    1027369
HVAC          1128470
Painting       985205
Plumbing      1038963
Roof          1070629
Name: cost, dtype: int64


In [60]:
"""
number of maintenance : 3,645
mostely for Roof and costly are Cleaning  
costed between 802.- 2,355  , with an avg of 1,594 
the  highest cost is 6,992.  
"""

'\nnumber of maintenance : 3,645\nmostely for Roof and costly are Cleaning  \ncosted between 802.- 2,355  , with an avg of 1,594 \nthe  highest cost is 6,992.  \n'

### Data Visualization

In [61]:
fig7 = px.histogram(ment_df, x='maintenance_type', y='cost',title="Cost By Maintenance Type")
fig7.show()

## Visit Dataset

### Load & Preview Data

In [None]:
visit_df = pd.read_csv(r"visit.csv")
visit_df.head()

Unnamed: 0,visit_id,client_id,property_id,agent_id,visit_date,visit_purpose
0,V000132,621,2658,138,12/24/2019,Negotiation
1,V000131,621,2658,138,1/7/2020,First Viewing
2,V002284,305,2289,125,1/19/2020,Negotiation
3,V000133,621,2658,138,1/22/2020,Final Check
4,V012737,972,2101,57,2/2/2020,Area Comparison


In [63]:
visit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16269 entries, 0 to 16268
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   visit_id       16269 non-null  object
 1   client_id      16269 non-null  int64 
 2   property_id    16269 non-null  int64 
 3   agent_id       16269 non-null  int64 
 4   visit_date     16269 non-null  object
 5   visit_purpose  16269 non-null  object
dtypes: int64(3), object(3)
memory usage: 762.7+ KB


In [64]:
visit_df.duplicated().sum()

np.int64(0)

### Data Cleaning

In [65]:
visit_df["visit_date"]=pd.to_datetime(visit_df["visit_date"])

In [66]:
visit_df.to_csv("visit_clean.csv",index=False)

### Descriptive Statistics & Group Insights

In [67]:
visit_df.describe(include="all")

Unnamed: 0,visit_id,client_id,property_id,agent_id,visit_date,visit_purpose
count,16269,16269.0,16269.0,16269.0,16269,16269
unique,16269,,,,,9
top,V011469,,,,,Negotiation
freq,1,,,,,3000
mean,,1039.1576003442128,1503.1386686335975,126.34070932448216,2023-03-11 16:38:30.160428032,
min,,2.0,1.0,1.0,2019-12-24 00:00:00,
25%,,538.0,747.0,62.0,2022-03-10 00:00:00,
50%,,1035.0,1516.0,127.0,2023-06-29 00:00:00,
75%,,1545.0,2250.0,192.0,2024-05-22 00:00:00,
max,,2060.0,3000.0,249.0,2024-12-30 00:00:00,


In [None]:
"""
number of visits : 16269
mostly reason is for Negotiation 
recently are in 2024-12-30  
"""

### Data Visualization

In [69]:
top_properties = (
    visit_df.groupby("property_id").agg(no_visits =("visit_id","count")).reset_index()
    .sort_values(by="no_visits", ascending=False)
    .head(10)
)

fig8 = px.bar(
    top_properties,
    x="property_id",
    y="no_visits",
    title="Top 10 Properties by No. Visits"
)

fig8.show()

In [70]:
visit_df['visit_year'] = visit_df['visit_date'].dt.year
visit_per_year = visit_df.groupby(['visit_year']).agg(no_visits=('visit_id','count')).reset_index()
fig9 = px.line(visit_per_year,x="visit_year",y='no_visits',markers=True,title=" No visits per year",
)
fig9.show()

In [71]:
visit_df['visit_year'] = visit_df['visit_date'].dt.year
visit_df['visit_month'] = visit_df['visit_date'].dt.month
visit_per_year = visit_df.groupby(['visit_year','visit_month']).agg(no_visits=('visit_id','count')).reset_index()
visit_per_year['no_visits_smooth'] = visit_per_year.groupby('visit_year')['no_visits'].transform(lambda x: x.rolling(window=4, min_periods=1).mean()) #Moving Average

fig10 = px.line(visit_per_year,x="visit_month",y='no_visits_smooth',color="visit_year",markers=True,title=" Smoothed No visits per month per year",
)
fig10.show()

## Sales Dataset

### Load & Preview Data


In [None]:
sales_df = pd.read_csv(r"sales.csv")
sales_df.head()

Unnamed: 0,sale_id,property_id,client_id,agent_id,owner_id,sale_date,sale_amount,commission_rate,commission_amount,commission_payment_method,commission_payment_date
0,1,662,679,107,1745,5/16/2023,886392.38,0.05,44319.62,Cash,9/5/2023
1,2,1422,2057,207,548,4/7/2022,237485.78,0.02,4749.72,Bank Transfer,4/25/2024
2,3,873,201,86,515,4/19/2023,235812.08,0.05,11790.6,Cash,7/4/2024
3,4,552,573,51,373,10/2/2024,266874.04,0.03,8006.22,Cash,10/30/2024
4,5,2020,1062,91,929,11/18/2023,156286.92,0.02,3125.74,Bank Transfer,2/24/2024


In [73]:
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1645 entries, 0 to 1644
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   sale_id                    1645 non-null   int64  
 1   property_id                1645 non-null   int64  
 2   client_id                  1645 non-null   int64  
 3   agent_id                   1645 non-null   int64  
 4   owner_id                   1645 non-null   int64  
 5   sale_date                  1645 non-null   object 
 6   sale_amount                1645 non-null   float64
 7   commission_rate            1645 non-null   float64
 8   commission_amount          1641 non-null   float64
 9   commission_payment_method  1645 non-null   object 
 10  commission_payment_date    1645 non-null   object 
dtypes: float64(3), int64(5), object(3)
memory usage: 141.5+ KB


In [74]:
sales_df.duplicated().sum()

np.int64(0)

In [75]:
"""
 fill null in commission_amount which are drivied from commission_rate * sale_amount.
 change datatypes.
"""

'\n fill null in commission_amount which are drivied from commission_rate * sale_amount.\n change datatypes.\n'

### Data Cleaning


In [76]:
sales_df["sale_date"]=pd.to_datetime(sales_df["sale_date"])
sales_df["commission_payment_date"]=pd.to_datetime(sales_df["commission_payment_date"])

In [77]:
sales_df["commission_amount"].fillna((sales_df["sale_amount"]*sales_df["commission_rate"]),inplace=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [78]:
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1645 entries, 0 to 1644
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   sale_id                    1645 non-null   int64         
 1   property_id                1645 non-null   int64         
 2   client_id                  1645 non-null   int64         
 3   agent_id                   1645 non-null   int64         
 4   owner_id                   1645 non-null   int64         
 5   sale_date                  1645 non-null   datetime64[ns]
 6   sale_amount                1645 non-null   float64       
 7   commission_rate            1645 non-null   float64       
 8   commission_amount          1645 non-null   float64       
 9   commission_payment_method  1645 non-null   object        
 10  commission_payment_date    1645 non-null   datetime64[ns]
dtypes: datetime64[ns](2), float64(3), int64(5), object(1)
memory usage: 1

In [79]:
sales_df.to_csv("sales_clean.csv",index=False)

### Descriptive Statistics & Group Insights

In [80]:
sales_df.describe(include="all")

Unnamed: 0,sale_id,property_id,client_id,agent_id,owner_id,sale_date,sale_amount,commission_rate,commission_amount,commission_payment_method,commission_payment_date
count,1645.0,1645.0,1645.0,1645.0,1645.0,1645,1645.0,1645.0,1645.0,1645,1645
unique,,,,,,,,,,3,
top,,,,,,,,,,Bank Transfer,
freq,,,,,,,,,,570,
mean,823.0,1505.5410334346504,1034.0784194528876,133.609726443769,883.8273556231003,2023-10-01 08:50:28.814589696,466658.2416838905,0.0328693009118541,15554.576782917931,,2024-05-22 00:06:07.659574528
min,1.0,1.0,2.0,1.0,1.0,2020-02-02 00:00:00,76107.66,0.02,1574.02,,2020-07-13 00:00:00
25%,412.0,750.0,525.0,69.0,422.0,2023-02-13 00:00:00,193961.93,0.02,5479.79,,2024-02-21 00:00:00
50%,823.0,1511.0,1035.0,134.0,899.0,2024-01-25 00:00:00,286719.09,0.03,9629.85,,2024-09-02 00:00:00
75%,1234.0,2279.0,1550.0,207.0,1341.0,2024-08-21 00:00:00,635458.41,0.05,20067.99,,2024-11-22 00:00:00
max,1645.0,3000.0,2059.0,249.0,1755.0,2024-12-30 00:00:00,2186056.02,0.05,98085.36,,2024-12-30 00:00:00


In [81]:
sales_df["commission_payment_method"].value_counts()

commission_payment_method
Bank Transfer    570
Cash             539
Check            536
Name: count, dtype: int64

In [82]:
"""
1,645 sales from 2020 to 2024 with avg price 286,719.092 and avg commission rate 0.03
mostely payed through Bank Transfer  

"""

'\n1,645 sales from 2020 to 2024 with avg price 286,719.092 and avg commission rate 0.03\nmostely payed through Bank Transfer  \n\n'

## Rent & Contracts Dataset

### Load & Preview Rent Data

In [None]:
rent_df= pd.read_csv(r"rents.csv")
rent_df.head()

Unnamed: 0,rent_id,property_id,client_id,agent_id,owner_id,rent_amount,commission_rate,commission_amount,commission_payment_method,commission_payment_date
0,1,107,794,240,1020,4230.0,0.1,10575.0,Credit Card,4/25/2024
1,2,2402,1075,84,755,2640.0,0.1,3960.0,Cash,11/27/2024
2,3,1112,1043,36,552,5120.0,0.05,2560.0,Credit Card,11/24/2024
3,4,2439,62,44,869,1660.0,0.05,1660.0,Cash,3/6/2024
4,5,2007,1582,162,1422,3980.0,0.1,2388.0,Bank Transfer,9/4/2024


In [84]:
rent_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1355 entries, 0 to 1354
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   rent_id                    1355 non-null   int64  
 1   property_id                1355 non-null   int64  
 2   client_id                  1355 non-null   int64  
 3   agent_id                   1355 non-null   int64  
 4   owner_id                   1355 non-null   int64  
 5   rent_amount                1345 non-null   float64
 6   commission_rate            1355 non-null   float64
 7   commission_amount          1355 non-null   float64
 8   commission_payment_method  1355 non-null   object 
 9   commission_payment_date    1355 non-null   object 
dtypes: float64(3), int64(5), object(2)
memory usage: 106.0+ KB


In [85]:
rent_df.duplicated().sum()

np.int64(0)

In [86]:
"""
 fill null in rent_amount which are drivied from commission_amount \ commission_rate * no month
 change datatypes.
"""


invalid escape sequence '\ '


invalid escape sequence '\ '


invalid escape sequence '\ '



'\n fill null in rent_amount which are drivied from commission_amount \\ commission_rate * no month\n change datatypes.\n'

### Load & Preview Contracts  Data

In [None]:
cnt_df = pd.read_csv(r"rent_contracts.csv")
cnt_df.head()

Unnamed: 0,rent_id,agreement_date,rent_start_date,rent_end_date,terms
0,1,3/1/2023,3/21/2023,4/9/2025,Lease agreement for Office in TX for 25 months...
1,2,11/21/2024,11/24/2024,2/17/2026,Lease agreement for Apartment in FL for 15 mon...
2,3,3/19/2023,3/21/2023,1/15/2024,Lease agreement for House in TX for 10 months....
3,4,4/9/2021,4/28/2021,12/19/2022,Lease agreement for Apartment in TX for 20 mon...
4,5,12/25/2021,12/31/2021,6/29/2022,Lease agreement for Office in CA for 6 months....


In [88]:
cnt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1355 entries, 0 to 1354
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   rent_id          1355 non-null   int64 
 1   agreement_date   1355 non-null   object
 2   rent_start_date  1355 non-null   object
 3   rent_end_date    1355 non-null   object
 4   terms            1355 non-null   object
dtypes: int64(1), object(4)
memory usage: 53.1+ KB


In [89]:
cnt_df.duplicated().sum()

np.int64(0)

In [90]:
"""
 change datatypes 
"""

'\n change datatypes \n'

### Data Cleaning

In [91]:
rent_df["commission_payment_date"]=pd.to_datetime(rent_df["commission_payment_date"])

In [92]:
cnt_df["rent_start_date"]=pd.to_datetime(cnt_df["rent_start_date"])
cnt_df["rent_end_date"]=pd.to_datetime(cnt_df["rent_end_date"])


In [93]:
Rent_Transaction_df = rent_df.merge(cnt_df, on="rent_id", how="left")
Rent_Transaction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1355 entries, 0 to 1354
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   rent_id                    1355 non-null   int64         
 1   property_id                1355 non-null   int64         
 2   client_id                  1355 non-null   int64         
 3   agent_id                   1355 non-null   int64         
 4   owner_id                   1355 non-null   int64         
 5   rent_amount                1345 non-null   float64       
 6   commission_rate            1355 non-null   float64       
 7   commission_amount          1355 non-null   float64       
 8   commission_payment_method  1355 non-null   object        
 9   commission_payment_date    1355 non-null   datetime64[ns]
 10  agreement_date             1355 non-null   object        
 11  rent_start_date            1355 non-null   datetime64[ns]
 12  rent_e

In [94]:
Rent_Transaction_df["rent_months"] = (
                                     ((Rent_Transaction_df["rent_end_date"].dt.year - Rent_Transaction_df["rent_start_date"].dt.year ) * 12) 
                                        + (Rent_Transaction_df["rent_end_date"].dt.month  - Rent_Transaction_df["rent_start_date"].dt.month)
                                       + np.where(
                                             Rent_Transaction_df["rent_end_date"].dt.day >= Rent_Transaction_df["rent_start_date"].dt.day,
                                             1,
                                             0
                                       )
                                   )

In [95]:
Rent_Transaction_df["rent_amount"].fillna((Rent_Transaction_df["commission_amount"]/(Rent_Transaction_df["commission_rate"] * Rent_Transaction_df["rent_months"])),inplace=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [96]:
Rent_Transaction_df.to_csv("rent_clean.csv",index=False)

### Descriptive Statistics & Group Insights

In [97]:
Rent_Transaction_df.describe(include="all")

Unnamed: 0,rent_id,property_id,client_id,agent_id,owner_id,rent_amount,commission_rate,commission_amount,commission_payment_method,commission_payment_date,agreement_date,rent_start_date,rent_end_date,terms,rent_months
count,1355.0,1355.0,1355.0,1355.0,1355.0,1355.0,1355.0,1355.0,1355,1355,1355,1355,1355,1355,1355.0
unique,,,,,,,,,3,,785,,,933,
top,,,,,,,,,Cash,,12/15/2024,,,Lease agreement for Office in NY for 32 months...,
freq,,,,,,,,,456,,8,,,5,
mean,678.0,1494.380073800738,1058.0369003690037,119.64501845018452,863.4929889298893,4600.952029520296,0.0756457564575645,7435.212177121771,,2024-04-26 04:18:14.612546048,,2023-10-07 13:19:10.405904128,2025-07-06 12:04:46.937269248,,21.26494464944649
min,1.0,2.0,5.0,2.0,1.0,650.0,0.05,357.0,,2020-06-16 00:00:00,,2020-04-13 00:00:00,2021-06-06 00:00:00,,6.0
25%,339.5,758.5,570.5,57.0,435.5,2180.0,0.05,2609.5,,2023-12-27 12:00:00,,2023-01-24 00:00:00,2024-08-19 00:00:00,,14.0
50%,678.0,1488.0,1074.0,120.0,864.0,3660.0,0.1,4965.0,,2024-08-18 00:00:00,,2024-01-18 00:00:00,2025-09-06 00:00:00,,21.0
75%,1016.5,2216.5,1556.0,182.0,1309.0,6180.0,0.1,9429.0,,2024-11-19 00:00:00,,2024-09-13 00:00:00,2026-07-16 12:00:00,,29.0
max,1355.0,2999.0,2060.0,248.0,1754.0,21740.0,0.1,55200.0,,2024-12-30 00:00:00,,2025-01-28 00:00:00,2028-01-01 00:00:00,,36.0
