In [1]:
# Dependencies
import requests
import pandas as pd
from census import Census

In [2]:
# Input locations: the sales extract (with tax columns) and the census table
path = "../Sales_info_with_taxes.csv"
study_results_path = "output_data/census.csv"

# Load both CSVs into DataFrames (sales first, then census)
sales_data, census_data = (
    pd.read_csv(csv_file) for csv_file in (path, study_results_path)
)

In [3]:
# Align the sales key column name ("ZIP") with the name used for the merge key ("Zipcode")
sales_data = sales_data.rename(columns={"ZIP": "Zipcode"})

In [4]:
# Left-join the census attributes onto every sale by Zipcode, then discard the
# 'Unnamed: 0' column (presumably an index written by an earlier CSV export - confirm).
complete_df = (
    sales_data
    .merge(census_data, how="left", on="Zipcode")
    .drop(columns="Unnamed: 0")
)
complete_df.head()

Unnamed: 0,Address,Count,Zipcode,Type,Owner Occ?,Purchase Amt,Amount Mortgaged,% Borrowed,Mtg Tx Pd,Mansion Tax Pd,New Sales Tax Amount,Population,Median Age,Household Income,Per Capita Income,Median Gross Rent,Poverty Count,Poverty Rate
0,1995 ANTHONY AVE,1,10457.0,SFR,0.0,2450000.0,2350000.0,95.918367,45825.0,24500.0,22050.0,79817.0,30.8,41145.0,20187.0,1381.0,27317.0,34.224539
1,146 E 176TH ST,1,10453.0,SFR,0.0,490000.0,0.0,0.0,0.0,0.0,6860.0,80385.0,33.8,34800.0,19016.0,1362.0,27796.0,34.578591
2,412 E 179TH ST,1,10457.0,SFR,0.0,355000.0,324328.0,91.36,5837.904,0.0,4970.0,79817.0,30.8,41145.0,20187.0,1381.0,27317.0,34.224539
3,2075 BATHGATE AVE,1,10457.0,SFR,0.0,420000.0,408000.0,97.142857,7344.0,0.0,5880.0,79817.0,30.8,41145.0,20187.0,1381.0,27317.0,34.224539
4,482 E 181ST ST,1,10457.0,SFR,0.0,365100.0,282000.0,77.239113,5076.0,0.0,5111.4,79817.0,30.8,41145.0,20187.0,1381.0,27317.0,34.224539


In [5]:
# Report (rows, columns) of the merged frame
frame_shape = complete_df.shape
print('The size of the data frame is:', frame_shape)

The size of the data frame is: (2552, 18)


In [6]:
# Number of distinct Zipcode values (a missing value, if any, counts as one)
complete_df['Zipcode'].nunique(dropna=False)

8

In [7]:
# Per-column tally of null flags: True = missing, False = present
missing_data = complete_df.isnull()
for column, null_flags in missing_data.items():
    print(column)
    print(null_flags.value_counts())
    print("")

Address
Address
False    2552
Name: count, dtype: int64

Count
Count
False    2552
Name: count, dtype: int64

Zipcode
Zipcode
False    2552
Name: count, dtype: int64

Type
Type
False    2552
Name: count, dtype: int64

Owner Occ?
Owner Occ?
False    2552
Name: count, dtype: int64

Purchase Amt
Purchase Amt
False    2552
Name: count, dtype: int64

Amount Mortgaged
Amount Mortgaged
False    2552
Name: count, dtype: int64

% Borrowed
% Borrowed
False    2552
Name: count, dtype: int64

Mtg Tx Pd
Mtg Tx Pd
False    2552
Name: count, dtype: int64

Mansion Tax Pd
Mansion Tax Pd
False    2552
Name: count, dtype: int64

New Sales Tax Amount
New Sales Tax Amount
False    2552
Name: count, dtype: int64

Population
Population
False    2552
Name: count, dtype: int64

Median Age
Median Age
False    2552
Name: count, dtype: int64

Household Income
Household Income
False    2552
Name: count, dtype: int64

Per Capita Income
Per Capita Income
False    2552
Name: count, dtype: int64

Median Gross Rent
Med

In [8]:
# Show every row that has at least one missing value in any column
complete_df.loc[complete_df.isna().any(axis="columns")]

Unnamed: 0,Address,Count,Zipcode,Type,Owner Occ?,Purchase Amt,Amount Mortgaged,% Borrowed,Mtg Tx Pd,Mansion Tax Pd,New Sales Tax Amount,Population,Median Age,Household Income,Per Capita Income,Median Gross Rent,Poverty Count,Poverty Rate


In [9]:
# # drop rows with NaN values
# complete_df.drop(index=[2446,2447,2448,2449,2450,2451,2452,2453,2454,2455,2456,2457,
# 2458,2459,2460,2461,2462,2463,2464,2465,2466,2467,2468,2469,2470,2471,2472,2473,7373,
# 8006,8026,10018,10019,10020,10021,10022,10023,10024,10025,10026,10027,10028,10029,10030,
# 10031,10032,10033,10034,10035,10036,10037,10038,10039,10040,10041,10042,10043,], inplace=True)

In [10]:
# Total count of missing cells across the whole frame
total_missing = complete_df.isnull().sum().sum()
print('The number of missing data in the dataset is:', total_missing)

The number of missing data in the dataset is: 0


In [11]:
# Inspect the dtype of every column to spot any that need converting
# before export (Zipcode is cast to int64 in the following cell).
complete_df.dtypes

Address                  object
Count                     int64
Zipcode                 float64
Type                     object
Owner Occ?              float64
Purchase Amt            float64
Amount Mortgaged        float64
% Borrowed              float64
Mtg Tx Pd               float64
Mansion Tax Pd          float64
New Sales Tax Amount    float64
Population              float64
Median Age              float64
Household Income        float64
Per Capita Income       float64
Median Gross Rent       float64
Poverty Count           float64
Poverty Rate            float64
dtype: object

In [12]:
# Store Zipcode as an integer column (the dtype listing above shows it as float64)
complete_df = complete_df.astype({"Zipcode": "int64"})

In [13]:
# Renumber rows 0..n-1 and discard the previous index
complete_df = complete_df.reset_index(drop=True)

In [14]:
# Write the merged sales + census frame to CSV, omitting the row index
complete_df.to_csv(
    "../complete.csv",
    index=False,
    encoding="utf-8",
)

In [15]:
# # Useful code

# # Data cleaning

# # prints the size of the dataframe
# print('The size of the data frame is:',census_df.shape) 

# # drop a certain column
# census_df.drop(columns='Unnamed: 0', inplace=True) 

# census_df['electric_eui'].replace(np.nan, census_df['Median Age'].mean(), inplace=True)

# # Find number of missing values for each column
# missing_data=census_pd.isnull()
# for column in missing_data.columns:
#     print(column)
#     print(missing_data[column].value_counts()) 
#     print("")

# # prints rows with missing values
# census_pd[census_pd.isnull().any(axis=1)]

# # reset indices
# census_pd.reset_index(drop=True, inplace=True)

# # print datatypes
# census_pd.dtypes

# # change datatype
# census_pd['Median Age']=census_pd['Median Age'].astype('int')

# # Exploratory Data Analysis

# # .describe
# census_pd[['columns']].describe()