In [1]:
# Dependencies
import requests
import pandas as pd
from census import Census

In [2]:
# Input CSV locations (relative to the notebook's working directory)
path = "../Sales_info_with_taxes.csv"
study_results_path = "output_data/census.csv"

# Load the sales records and the census table into DataFrames
sales_data = pd.read_csv(filepath_or_buffer=path)
census_data = pd.read_csv(filepath_or_buffer=study_results_path)

In [3]:
# Rename the sales "ZIP" column to "Zipcode", the key name used for the merge
sales_data = sales_data.rename(columns={"ZIP": "Zipcode"})

In [4]:
# Left-join the census columns onto every sales row by Zipcode, then discard
# the "Unnamed: 0" column (presumably a saved index from one of the CSVs -- TODO confirm)
complete_df = (
    sales_data
    .merge(census_data, how="left", on="Zipcode")
    .drop(columns="Unnamed: 0")
)
complete_df.head()

Unnamed: 0,Count,Zipcode,Type,Owner Occ?,Purchase Down %,Purchase Amt,Amount Mortgaged,Mtg Tx Pd,Mansion Tax Pd,New Sales Tax Amount,Population,Median Age,Household Income,Per Capita Income,Median Gross Rent,Poverty Count,Poverty Rate
0,1,11201.0,CND,0.0,33.0,4600000.0,3062500.0,59718.75,46000.0,41400.0,69755.0,35.1,163310.0,108933.0,3110.0,7004.0,10.040857
1,1,11201.0,CND,0.0,30.0,4550000.0,3185000.0,62107.5,45500.0,40950.0,69755.0,35.1,163310.0,108933.0,3110.0,7004.0,10.040857
2,1,11201.0,CND,0.0,69.0,2620000.0,825084.0,16089.138,26200.0,23580.0,69755.0,35.1,163310.0,108933.0,3110.0,7004.0,10.040857
3,1,11201.0,CND,0.0,100.0,4740000.0,0.0,0.0,47400.0,42660.0,69755.0,35.1,163310.0,108933.0,3110.0,7004.0,10.040857
4,1,11201.0,CND,0.0,81.0,4200000.0,787743.0,15360.9885,42000.0,37800.0,69755.0,35.1,163310.0,108933.0,3110.0,7004.0,10.040857


In [5]:
# Show the (rows, columns) shape of the merged frame
print(f"The size of the data frame is: {complete_df.shape}")

The size of the data frame is: (17758, 17)


In [6]:
# Number of distinct Zipcode values (a NaN, if present, counts as one value)
complete_df["Zipcode"].nunique(dropna=False)

47

In [7]:
# For each column, print how many entries are null (True) vs. present (False)
missing_data = complete_df.isnull()
for column, null_flags in missing_data.items():
    print(column)
    print(null_flags.value_counts())
    print("")

Count
Count
False    17758
Name: count, dtype: int64

Zipcode
Zipcode
False    17758
Name: count, dtype: int64

Type
Type
False    17758
Name: count, dtype: int64

Owner Occ?
Owner Occ?
False    17758
Name: count, dtype: int64

Purchase Down %
Purchase Down %
False    17758
Name: count, dtype: int64

Purchase Amt
Purchase Amt
False    17758
Name: count, dtype: int64

Amount Mortgaged
Amount Mortgaged
False    17758
Name: count, dtype: int64

Mtg Tx Pd
Mtg Tx Pd
False    17758
Name: count, dtype: int64

Mansion Tax Pd
Mansion Tax Pd
False    17758
Name: count, dtype: int64

New Sales Tax Amount
New Sales Tax Amount
False    17758
Name: count, dtype: int64

Population
Population
False    17701
True        57
Name: count, dtype: int64

Median Age
Median Age
False    17701
True        57
Name: count, dtype: int64

Household Income
Household Income
False    17701
True        57
Name: count, dtype: int64

Per Capita Income
Per Capita Income
False    17701
True        57
Name: count, dtype: i

In [8]:
# Display every row that has a null in at least one column
complete_df.loc[complete_df.isna().any(axis="columns")]

Unnamed: 0,Count,Zipcode,Type,Owner Occ?,Purchase Down %,Purchase Amt,Amount Mortgaged,Mtg Tx Pd,Mansion Tax Pd,New Sales Tax Amount,Population,Median Age,Household Income,Per Capita Income,Median Gross Rent,Poverty Count,Poverty Rate
2446,1,11243.0,CND,0.0,23.0,520000.0,400000.0,7200.0,0.0,7280.0,,,,,,,
2447,1,11243.0,CND,0.0,100.0,1250000.0,1299556.0,25341.342,12500.0,11250.0,,,,,,,
2448,1,11243.0,CND,0.0,35.0,1668500.0,0.0,0.0,16685.0,15016.5,,,,,,,
2449,1,11243.0,CND,0.0,100.0,1125000.0,0.0,0.0,11250.0,10125.0,,,,,,,
2450,1,11243.0,CND,0.0,81.0,995000.0,184239.0,3316.302,0.0,13930.0,,,,,,,
2451,1,11243.0,CND,0.0,100.0,1750000.0,0.0,0.0,17500.0,15750.0,,,,,,,
2452,1,11243.0,CND,0.0,100.0,590000.0,0.0,0.0,0.0,8260.0,,,,,,,
2453,1,11243.0,CND,0.0,30.0,550000.0,0.0,0.0,0.0,7700.0,,,,,,,
2454,1,11243.0,CND,0.0,95.0,14250000.0,750000.0,14625.0,142500.0,128250.0,,,,,,,
2455,1,11243.0,CND,0.0,88.0,1695000.0,200000.0,3600.0,16950.0,15255.0,,,,,,,


In [9]:
# Drop every row that has at least one NaN value.
# The rows are located from the data itself rather than from a hand-copied list
# of index labels, so the cell keeps working when the input files change and can
# be re-run without raising KeyError on labels that have already been removed.
complete_df.dropna(inplace=True)

In [10]:
# Total count of null cells across the whole frame
print(f"The number of missing data in the dataset is: {complete_df.isna().sum().sum()}")

The number of missing data in the dataset is: 0


In [11]:
# Inspect each column's dtype to spot columns that need converting before export
complete_df.dtypes

Count                     int64
Zipcode                 float64
Type                     object
Owner Occ?              float64
Purchase Down %         float64
Purchase Amt            float64
Amount Mortgaged        float64
Mtg Tx Pd               float64
Mansion Tax Pd          float64
New Sales Tax Amount    float64
Population              float64
Median Age              float64
Household Income        float64
Per Capita Income       float64
Median Gross Rent       float64
Poverty Count           float64
Poverty Rate            float64
dtype: object

In [12]:
# Zipcode is float64 after the merge; store it as a 64-bit integer instead
zipcode_as_int = complete_df["Zipcode"].astype("int64")
complete_df["Zipcode"] = zipcode_as_int

In [13]:
# Reset the index in place to a fresh 0..n-1 range, discarding the old labels
# (drop=True) so the gaps left by the rows removed earlier disappear
complete_df.reset_index(drop=True, inplace=True)

In [14]:
# Write the cleaned, merged frame to CSV (UTF-8, without the index column)
complete_df.to_csv(path_or_buf="../complete.csv", index=False, encoding="utf-8")

In [15]:
# # Useful code

# # Data cleaning

# # prints the size of the dataframe
# print('The size of the data frame is:',census_df.shape) 

# # drop a certain column
# census_df.drop(columns='Unnamed: 0', inplace=True) 

# # replace missing values in a column with that column's mean
# census_df['Median Age'].replace(np.nan, census_df['Median Age'].mean(), inplace=True)

# # Find number of missing values for each column
# missing_data=census_pd.isnull()
# for column in missing_data.columns:
#     print(column)
#     print(missing_data[column].value_counts()) 
#     print("")

# # prints rows with missing values
# census_pd[census_pd.isnull().any(axis=1)]

# # reset indices
# census_pd.reset_index(drop=True, inplace=True)

# # print datatypes
# census_pd.dtypes

# # change datatype
# census_pd['Median Age']=census_pd['Median Age'].astype('int')

# # Exploratory Data Analysis

# # .describe
# census_pd[['columns']].describe()