In [115]:
import pandas as pd

In [116]:
# Load the house data CSV (path is relative to the kernel's working directory).
# `df` is the raw frame that the later cells read from.
df = pd.read_csv("./kc_house_data.csv")

## Part 1


##### Numerical and categorical variables are fundamental concepts in data analysis. Numerical variables represent quantities that can be measured or counted, while categorical variables represent categories or groups. However, there are situations where numerical variables can also be treated as categorical.

#### Numerical Variables & Categorical Variables

Numerical variables have an order, and mathematical operations can be performed on them.
Ex: Age, height, weight. Where we can calculate the mean, median...

Categorical variables represent a category or state. In some cases they are encoded as numbers, but they are still not considered numerical.
Ex: Ratings, Time Periods, Postal codes

#### Main cases where a categorical variable looks numerical:

1. Binning: This is where we group numerical data into intervals.
Ex: Converting age into intervals
2. Ratings: Movie ratings, product reviews, they are numbers representing a specific meaning
3. Codes: Postal Codes, Product Codes
4. Times: Years, Months.

### Examples from our Dataset

In [117]:
# Preview the two integer-coded columns used as examples in this section.
df.loc[:, ["grade", "condition"]]


Unnamed: 0,grade,condition
0,7,3
1,7,3
2,6,3
3,7,5
4,8,3
...,...,...
21608,8,3
21609,8,3
21610,7,3
21611,8,3


In [118]:
# Largest value that occurs in the condition column.
df["condition"].max()

5

#### Here we can see that both the grade and condition columns represent categorical values encoded in numerical form. Since its maximum is 5, a condition of 5 most probably means "Excellent", whereas a condition of 1 would mean "Poor".

### Another example we can take is the waterfront column

In [119]:
# Double brackets select a one-column DataFrame, so .max() returns a Series
# labelled by column name rather than a bare scalar.
df[["waterfront"]].max()

waterfront    1
dtype: int64

#### In the waterfront column we have values of zeros and ones, indicating whether or not there is a waterfront view

## Part 2

#### Introducing Missing Values

In [120]:
import numpy as np
# Despite the name this is a fraction, not a percent: it is compared against
# uniform random draws when the mask is built, so about 20% of cells become NaN.
missing_percentage = 0.2 

In [121]:
# Columns that will receive artificial gaps. Working on a copy keeps the
# original `df` intact.
list_cols = ["price", "bedrooms", "bathrooms", "floors"]
df_ = df.loc[:, list_cols].copy()

In [122]:
# Blank out roughly `missing_percentage` of the cells at random.
# A fixed-seed generator is created inside this cell so that Restart & Run All
# (or re-running just this cell) always selects the same cells; the previous
# unseeded np.random.rand call produced different gaps on every execution.
# NOTE: outputs recorded from an earlier unseeded run will differ until the
# notebook is re-executed.
rng = np.random.default_rng(42)
mask = rng.random(df_.shape) < missing_percentage  # True marks a cell to blank
df_[mask] = np.nan

In [123]:
# The non-null counts per column show how many values were blanked out.
df_.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   price      17275 non-null  float64
 1   bedrooms   17252 non-null  float64
 2   bathrooms  17256 non-null  float64
 3   floors     17376 non-null  float64
dtypes: float64(4)
memory usage: 675.5 KB


In [124]:
# Look at the first rows to see the injected NaN values.
df_.head()

Unnamed: 0,price,bedrooms,bathrooms,floors
0,221900.0,,1.0,1.0
1,538000.0,3.0,,2.0
2,180000.0,2.0,1.0,
3,604000.0,,3.0,1.0
4,,3.0,2.0,1.0


#### Removing All Missing Values

In [125]:
# Listwise deletion: discard every row in which at least one column is NaN.
df_cleaned = df_.dropna(axis=0, how="any")

In [126]:
# Only fully populated rows remain, so the index labels now have gaps.
df_cleaned.head()

Unnamed: 0,price,bedrooms,bathrooms,floors
6,257500.0,3.0,2.25,2.0
7,291850.0,3.0,1.5,1.0
9,323000.0,3.0,2.5,2.0
13,400000.0,3.0,1.75,1.0
15,650000.0,4.0,3.0,2.0


#### Removing Missing Values for Bathrooms

In [127]:
# Keep only the rows whose bathrooms value is present; gaps in the other
# columns are left untouched.
has_bathrooms = df_["bathrooms"].notna()
df_cleaned_bath = df_.loc[has_bathrooms]
df_cleaned_bath.head(20)

Unnamed: 0,price,bedrooms,bathrooms,floors
0,221900.0,,1.0,1.0
2,180000.0,2.0,1.0,
3,604000.0,,3.0,1.0
4,,3.0,2.0,1.0
5,,4.0,4.5,1.0
6,257500.0,3.0,2.25,2.0
7,291850.0,3.0,1.5,1.0
8,229500.0,3.0,1.0,
9,323000.0,3.0,2.5,2.0
10,662500.0,,2.5,1.0


#### Getting Median for Price and replacing the missing values with it

In [128]:
# Take a separate copy to fill, so `df_` itself keeps its gaps, and compute
# the median of the prices (NaN entries are skipped by pandas by default).
df_fill_price = df_.copy(deep=True)
p_median = df_.loc[:, "price"].median()

In [129]:
# Impute the missing prices with the already-computed median `p_median`
# instead of recalculating it inline; the other columns keep their gaps.
df_fill_price["price"] = df_["price"].fillna(p_median)

In [130]:
# Rows that had a missing price now show the median value instead.
df_fill_price.head(10)

Unnamed: 0,price,bedrooms,bathrooms,floors
0,221900.0,,1.0,1.0
1,538000.0,3.0,,2.0
2,180000.0,2.0,1.0,
3,604000.0,,3.0,1.0
4,450000.0,3.0,2.0,1.0
5,450000.0,4.0,4.5,1.0
6,257500.0,3.0,2.25,2.0
7,291850.0,3.0,1.5,1.0
8,229500.0,3.0,1.0,
9,323000.0,3.0,2.5,2.0


#### Dropping Floors Column

In [131]:
# Remove the floors column.
# Reassigning (rather than inplace=True) together with errors="ignore" lets
# this cell be re-run without raising KeyError once the column is gone.
# `columns=` already names the axis, so the redundant axis=1 is dropped.
df_fill_price = df_fill_price.drop(columns=["floors"], errors="ignore")

In [132]:
# Display the frame to confirm the floors column is gone (pandas truncates
# the rendering of long frames).
df_fill_price

Unnamed: 0,price,bedrooms,bathrooms
0,221900.0,,1.00
1,538000.0,3.0,
2,180000.0,2.0,1.00
3,604000.0,,3.00
4,450000.0,3.0,2.00
...,...,...,...
21608,360000.0,3.0,2.50
21609,400000.0,4.0,2.50
21610,450000.0,2.0,0.75
21611,400000.0,3.0,2.50


#### Dropping NAN in Bedrooms and Bathrooms Columns

In [133]:
# Keep only rows where both bathrooms and bedrooms are known. dropna returns
# a new frame, so df_fill_price itself is left unchanged.
df_drop = df_fill_price.dropna(subset=["bathrooms", "bedrooms"])
df_drop

Unnamed: 0,price,bedrooms,bathrooms
2,180000.0,2.0,1.00
4,450000.0,3.0,2.00
5,450000.0,4.0,4.50
6,257500.0,3.0,2.25
7,291850.0,3.0,1.50
...,...,...,...
21608,360000.0,3.0,2.50
21609,400000.0,4.0,2.50
21610,450000.0,2.0,0.75
21611,400000.0,3.0,2.50


#### Dropping Rows (keeping only rows with more than 3 bedrooms)

In [134]:
# Boolean filter: retain only the rows with more than 3 bedrooms.
condition = df_drop["bedrooms"].gt(3)
df_drop = df_drop.loc[condition]
df_drop

Unnamed: 0,price,bedrooms,bathrooms
5,450000.0,4.0,4.50
15,650000.0,4.0,3.00
20,450000.0,4.0,1.75
22,450000.0,5.0,2.50
34,322500.0,4.0,2.75
...,...,...,...
21596,450000.0,5.0,2.75
21597,1580000.0,4.0,3.25
21598,450000.0,4.0,2.50
21606,1010000.0,4.0,3.50
