In [218]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import fuzzywuzzy for text matching
from fuzzywuzzy import fuzz, process

# Import machine learning libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error,r2_score
from scipy import stats

In [219]:
df1 = pd.read_csv("data/Bengaluru_House_Data.csv")
df1.head(10)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0
5,Super built-up Area,Ready To Move,Whitefield,2 BHK,DuenaTa,1170,2.0,1.0,38.0
6,Super built-up Area,18-May,Old Airport Road,4 BHK,Jaades,2732,4.0,,204.0
7,Super built-up Area,Ready To Move,Rajaji Nagar,4 BHK,Brway G,3300,4.0,,600.0
8,Super built-up Area,Ready To Move,Marathahalli,3 BHK,,1310,3.0,1.0,63.25
9,Plot Area,Ready To Move,Gandhi Bazar,6 Bedroom,,1020,6.0,,370.0


In [220]:
df1['location'].value_counts()

location
Whitefield                         540
Sarjapur  Road                     399
Electronic City                    302
Kanakpura Road                     273
Thanisandra                        234
                                  ... 
3rd Stage Raja Rajeshwari Nagar      1
Chuchangatta Colony                  1
Electronic City Phase 1,             1
Chikbasavanapura                     1
Abshot Layout                        1
Name: count, Length: 1305, dtype: int64

In [221]:
# df1.shape
df2 = df1.drop(['society', 'area_type','availability','balcony'],axis=1)
df2.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [222]:
# df2.isnull().sum() 
df2['location']=df2['location'].fillna('Sarjapur Road')

In [223]:
df2['size']=df2['size'].fillna('2 BHK')

In [224]:
df2.isnull().sum() 

location       0
size           0
total_sqft     0
bath          73
price          0
dtype: int64

In [225]:
# Extract number of bedrooms from size
df2['bhk'] = df2['size'].apply(lambda x: int(str(x).split(' ')[0]) if isinstance(x, str) else None)
df2.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [226]:
df2['bath'] = df2.apply(lambda row: row['bhk'] if pd.isnull(row['bath']) else row['bath'], axis=1)
df2.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
bhk           0
dtype: int64

In [227]:
df2['total_sqft'].unique()

# Extract unique values
unique_values = df2['total_sqft'].unique()

# Categorize values
floats = []
ranges = []
strings = []
unknown = []

for value in unique_values:
    try:
        # Try to convert to float
        float_value = float(value)
        floats.append(value)
    except ValueError:
        # Check if it is a range
        if '-' in value and all(part.strip().replace('.', '', 1).isdigit() for part in value.split('-')):
            ranges.append(value)
        else:
            try:
                # Check if it's a valid range format (e.g., '1133 - 1384')
                start, end = value.split('-')
                float(start.strip())
                float(end.strip())
                ranges.append(value)
            except (ValueError, IndexError):
                # If it doesn't fit float or range, classify it as string or unknown
                if value.isalpha() or not any(char.isdigit() for char in value):
                    strings.append(value)
                else:
                    unknown.append(value)

# Display categorized results
print("Floats:")
print(floats)
print("\nRanges:")
print(ranges)
print("\nStrings:")
print(strings)
print("\nUnknown:")
print(unknown)

Floats:
['1056', '2600', '1440', '1521', '1200', '1170', '2732', '3300', '1310', '1020', '1800', '2785', '1000', '1100', '2250', '1175', '1180', '1540', '2770', '600', '1755', '2800', '1767', '510', '1250', '660', '1610', '1151', '1025', '1075', '1760', '1693', '1925', '700', '1070', '1724', '1290', '1143', '1296', '1254', '1330.74', '970', '1459', '800', '869', '1270', '1670', '2010', '1185', '1600', '1500', '1407', '840', '4395', '845', '5700', '1160', '3000', '1140', '1220', '1350', '1005', '500', '1358', '1569', '1240', '2089', '1206', '1150', '2511', '460', '4400', '1660', '1326', '1325', '1499', '1665', '708', '1060', '710', '1450', '2894', '1330', '2502', '650', '2400', '1007', '966', '1630', '1640', '782', '1260', '1413', '1116', '1530', '3700', '2497', '1436', '276', '1427', '2061', '2650', '1282', '1050', '945', '950', '1870', '880', '1535', '1360', '1280', '5000', '3050', '1563.05', '1167', '4000', '1828', '890', '1612', '1034', '1710', '957', '2795', '1125', '1735', '2050',

In [228]:
def convert_sqft_to_num(x):
    # Dictionary mapping units to their conversion factors to square feet
    conversion_factors = {
        'Sq. Meter': 10.7639,
        'Sq. Yard': 9,
        'Cent': 435.6,
        'Acre': 43560
    }

    # Check for unit and convert
    for unit, factor in conversion_factors.items():
        if unit in x:
            number = float(x.split(unit)[0].strip())
            return round(number * factor, 2)

    # Handle ranges of values
    tokens = x.split('-')
    if len(tokens) == 2:
        return round((float(tokens[0]) + float(tokens[1])) / 2, 2)

    # Handle direct conversion to float
    try:
        return round(float(x), 2)
    except ValueError:
        return None  # Return None if conversion fails

In [229]:
df2['total_sqft'] = df2['total_sqft'].apply(convert_sqft_to_num)
df2.head(20)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2
5,Whitefield,2 BHK,1170.0,2.0,38.0,2
6,Old Airport Road,4 BHK,2732.0,4.0,204.0,4
7,Rajaji Nagar,4 BHK,3300.0,4.0,600.0,4
8,Marathahalli,3 BHK,1310.0,3.0,63.25,3
9,Gandhi Bazar,6 Bedroom,1020.0,6.0,370.0,6


In [230]:
df3 = df2[df2.bhk < 10]
df3['bhk'].value_counts()

bhk
2    5544
3    4857
4    1417
1     656
5     356
6     221
7     100
8      89
9      54
Name: count, dtype: int64

In [231]:
df3.loc[:, 'price_per_sq_feet']=df3['price']*100000 / df3['total_sqft']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.loc[:, 'price_per_sq_feet']=df3['price']*100000 / df3['total_sqft']


In [232]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13294 entries, 0 to 13319
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   location           13294 non-null  object 
 1   size               13294 non-null  object 
 2   total_sqft         13290 non-null  float64
 3   bath               13294 non-null  float64
 4   price              13294 non-null  float64
 5   bhk                13294 non-null  int64  
 6   price_per_sq_feet  13290 non-null  float64
dtypes: float64(4), int64(1), object(2)
memory usage: 830.9+ KB


In [233]:
df3.drop(['size'],axis=1)

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sq_feet
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.00,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.00,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.00,3,6245.890861
4,Kothanur,1200.0,2.0,51.00,2,4250.000000
...,...,...,...,...,...,...
13315,Whitefield,3453.0,4.0,231.00,5,6689.834926
13316,Richards Town,3600.0,5.0,400.00,4,11111.111111
13317,Raja Rajeshwari Nagar,1141.0,2.0,60.00,2,5258.545136
13318,Padmanabhanagar,4689.0,4.0,488.00,4,10407.336319


In [234]:
df3.loc[:, 'location']=df3['location'].apply(lambda x:x.strip())
location_count=df3['location'].value_counts()

In [235]:
location_count_less_10= location_count[location_count<=10]
location_count_less_10

location
Nagappa Reddy Layout         10
Ganga Nagar                  10
Sector 1 HSR Layout          10
Kalkere                      10
Basapura                     10
                             ..
Sarvobhogam Nagar             1
Prasanna layout Herohalli     1
Kanakapur main road           1
Sindhi Colony                 1
Masjid e Alkareem             1
Name: count, Length: 1044, dtype: int64

In [236]:
df3.loc[:, 'location']=df3['location'].apply(lambda x: 'other' if x in location_count_less_10 else x)

In [237]:
df3['location'].value_counts()

location
other                        2869
Whitefield                    541
Sarjapur  Road                399
Electronic City               304
Kanakpura Road                273
                             ... 
Tindlu                         11
Marsur                         11
2nd Phase Judicial Layout      11
Thyagaraja Nagar               11
HAL 2nd Stage                  11
Name: count, Length: 242, dtype: int64

In [238]:
df4=df3[((df3['total_sqft']/df3['bhk']) >= 300)]

In [239]:
df4.shape

(12558, 7)

In [240]:
def remove_outliers_sqft(df):
    df_output = pd.DataFrame()
    for location, subdf in df.groupby('location'):  # Unpack the group key and DataFrame
        m = np.mean(subdf['price_per_sq_feet'])
        st = np.std(subdf['price_per_sq_feet'])
        gen_df = subdf[(subdf['price_per_sq_feet'] > (m - st)) & (subdf['price_per_sq_feet'] <= (m + st))]
        df_output = pd.concat([df_output, gen_df], ignore_index=True)
    return df_output

df5 = remove_outliers_sqft(df4)


In [241]:
df5.shape

(10315, 7)

In [242]:
df5.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sq_feet
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,15017.54386
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,11901.840491
2,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,12533.333333
3,1st Block Jayanagar,3 BHK,1200.0,2.0,130.0,3,10833.333333
4,1st Block Jayanagar,2 BHK,1235.0,2.0,148.0,2,11983.805668


In [243]:
def bhk_outlier_removal(df):
    exclude_indices = np.array([])
    for location,location_df in df.groupby('location'):
        bhk_stats={}
        for bhk,bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk]={
                'mean':np.mean(bhk_df.price_per_sq_feet),
                'std':np.std(bhk_df.price_per_sq_feet),
                'count':bhk_df.shape[0]
            }

        for bhk,bhk_df in location_df.groupby('bhk'):
            stats=bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices=np.append(exclude_indices,bhk_df[bhk_df.price_per_sq_feet<(stats['mean'])].index.values)
    return df.drop(exclude_indices,axis='index')

df6=bhk_outlier_removal(df5)

In [244]:
# df1.shape
# df6 = df6.drop(['price_per_sq_feet','size'],axis=1)
df6.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sq_feet
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,15017.54386
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,11901.840491
2,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,12533.333333
3,1st Block Jayanagar,3 BHK,1200.0,2.0,130.0,3,10833.333333
4,1st Block Jayanagar,2 BHK,1235.0,2.0,148.0,2,11983.805668


In [245]:
df6.shape

(7364, 7)

In [246]:
df6.to_csv('data/cleaned_new.csv', index=False)


In [251]:
unique_locations = df6['location'].unique()
print(unique_locations)
df_unique_locations = pd.DataFrame(unique_locations, columns=['location'])

# Save to CSV
df_unique_locations.to_csv('data/unique_locations.csv', index=False)

['1st Block Jayanagar' '1st Phase JP Nagar' '2nd Phase Judicial Layout'
 '2nd Stage Nagarbhavi' '5th Block Hbr Layout' '5th Phase JP Nagar'
 '6th Phase JP Nagar' '7th Phase JP Nagar' '8th Phase JP Nagar'
 '9th Phase JP Nagar' 'AECS Layout' 'Abbigere' 'Akshaya Nagar'
 'Ambalipura' 'Ambedkar Nagar' 'Amruthahalli' 'Anandapura' 'Ananth Nagar'
 'Anekal' 'Anjanapura' 'Ardendale' 'Arekere' 'Attibele' 'BEML Layout'
 'BTM 2nd Stage' 'BTM Layout' 'Babusapalaya' 'Badavala Nagar' 'Balagere'
 'Banashankari' 'Banashankari Stage II' 'Banashankari Stage III'
 'Banashankari Stage V' 'Banashankari Stage VI' 'Banaswadi'
 'Banjara Layout' 'Bannerghatta' 'Bannerghatta Road' 'Basavangudi'
 'Basaveshwara Nagar' 'Battarahalli' 'Begur' 'Begur Road' 'Bellandur'
 'Benson Town' 'Bharathi Nagar' 'Bhoganhalli' 'Billekahalli' 'Binny Pete'
 'Bisuvanahalli' 'Bommanahalli' 'Bommasandra'
 'Bommasandra Industrial Area' 'Bommenahalli' 'Brookefield' 'Budigere'
 'CV Raman Nagar' 'Chamrajpet' 'Chandapura' 'Channasandra'
 'Ch

# Break Here 

In [248]:
df6 = df5[df5.price < 1000]
df6.head()


Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sq_feet
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,15017.54386
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,11901.840491
2,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,12533.333333
3,1st Block Jayanagar,3 BHK,1200.0,2.0,130.0,3,10833.333333
4,1st Block Jayanagar,2 BHK,1235.0,2.0,148.0,2,11983.805668


In [249]:
# Calculate price per square foot and round to 2 decimal points
df6['price_per_sqft'] = ((df6['price'] * 100000) / df6['total_sqft']).round(2)

# Display the first few rows of the dataframe
df6.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6['price_per_sqft'] = ((df6['price'] * 100000) / df6['total_sqft']).round(2)


Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sq_feet,price_per_sqft
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,15017.54386,15017.54
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,11901.840491,11901.84
2,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,12533.333333,12533.33
3,1st Block Jayanagar,3 BHK,1200.0,2.0,130.0,3,10833.333333,10833.33
4,1st Block Jayanagar,2 BHK,1235.0,2.0,148.0,2,11983.805668,11983.81


In [250]:
df6['availability_binary'] = df6['availability'].apply(lambda x: 1 if x == 'Ready To Move' else 0)
print(df6['availability_binary'].value_counts())
df6.head()

KeyError: 'availability'

In [None]:
df7 = df6.drop('availability', axis='columns')
df7.head()
df7.shape


(13225, 8)

In [None]:
def standardize_location(name):
    return name.lower().strip().replace(' ', '')

df7['standardized_location'] = df7['location'].apply(standardize_location)

In [None]:
exact_duplicates = df7[df7.duplicated(['standardized_location'], keep=False)]


In [None]:
# Find near duplicates
def find_near_duplicates(locations, threshold=90):
    near_duplicates = {}
    for loc in locations:
        matches = process.extract(loc, locations, scorer=fuzz.token_sort_ratio)
        near_duplicates[loc] = [match for match in matches if match[1] >= threshold and match[0] != loc]
    return near_duplicates

unique_locations = df7['standardized_location'].unique()
near_duplicates = find_near_duplicates(unique_locations, threshold=95)
near_duplicates = {key: value for key, value in near_duplicates.items() if value}

In [None]:
replacement_dict = {}
for location, duplicates in near_duplicates.items():
  all_occurrences = [location] + [d[0] for d in duplicates]
  most_frequent = max(set(all_occurrences), key=all_occurrences.count)
  replacement_dict[location] = most_frequent
  for duplicate in duplicates:
    replacement_dict[duplicate[0]] = most_frequent

In [None]:
# Replace near-duplicate location names in the DataFrame
df7['standardized_location'] = df7['standardized_location'].replace(replacement_dict)

In [None]:
df8 = df7.drop('location', axis='columns')
df8.head()
df8['standardized_location'].value_counts()


standardized_location
whitefield                        541
sarjapurroad                      405
electroniccity                    304
kanakapuraroad                    277
thanisandra                       237
                                 ... 
4bedroomfarmhouseinbagalur          1
jakkasandra                         1
gulakamale                          1
bemllayout,rajarajeshwarinagar      1
duddanahalli                        1
Name: count, Length: 1184, dtype: int64

In [None]:
df8.head()

Unnamed: 0,total_sqft,bath,balcony,price,bhk,price_per_sqft,availability_binary,standardized_location
0,1056.0,2.0,1.0,39.07,2.0,3699.81,0,electroniccityphaseii
1,2600.0,5.0,3.0,120.0,4.0,4615.38,1,chikkatirupathi
2,1440.0,2.0,3.0,62.0,3.0,4305.56,1,uttarahalli
3,1521.0,3.0,1.0,95.0,3.0,6245.89,1,lingadheeranahalli
4,1200.0,2.0,1.0,51.0,2.0,4250.0,1,kothanur


In [None]:
# Log transformation for large values
df8['log_total_sqft'] = np.log1p(df8['total_sqft'])
df8['log_price'] = np.log1p(df8['price'])
df8['log_price_per_sqft'] = np.log1p(df8['price_per_sqft'])

# Standardization
scaler = StandardScaler()
df8['std_total_sqft'] = scaler.fit_transform(df8[['total_sqft']])
df8['std_price'] = scaler.fit_transform(df8[['price']])
df8['std_price_per_sqft'] = scaler.fit_transform(df8[['price_per_sqft']])

# Normalization
normalizer = MinMaxScaler()
df8['norm_total_sqft'] = normalizer.fit_transform(df8[['total_sqft']])
df8['norm_price'] = normalizer.fit_transform(df8[['price']])
df8['norm_price_per_sqft'] = normalizer.fit_transform(df8[['price_per_sqft']])

In [None]:
# Calculate 1st and 99th percentiles
low_percentile = df8['log_price_per_sqft'].quantile(0.01)
high_percentile = df8['log_price_per_sqft'].quantile(0.99)

# Remove outliers
df = df8[(df8['log_price_per_sqft'] >= low_percentile) & (df8['log_price_per_sqft'] <= high_percentile)]

# 2. Remove total sqft outliers

# Standardize total_sqft
df8['standardized_total_sqft'] = stats.zscore(df['total_sqft'])

# Remove points beyond 3 standard deviations
df8 = df8[np.abs(df8['standardized_total_sqft']) <= 3]

# Reset index after removing rows
df8 = df8.reset_index(drop=True)

print(f"Shape of DataFrame after outlier removal: {df8.shape}")

Shape of DataFrame after outlier removal: (12670, 18)


In [None]:
# Filter rows based on conditions
df9 = df8[~((df8['bath'] >= 4) & (df8['total_sqft'] < 2000))]
df9.shape

(11857, 18)

In [None]:
df10 = df9[~(df9['total_sqft'] < 300)]
df10.shape


(11853, 18)

In [None]:
# mean_prices = df10.groupby('standardized_location')['price'].mean()

# # Map the mean prices to the original DataFrame
# df10['location_encoded'] = df10['standardized_location'].map(mean_prices)

# print(df10)

In [None]:
# # Calculate the global mean of the target variable
# global_mean = df10['price'].mean()

# # Define the smoothing parameter
# m = 3

# # Apply smoothing to the target encoding
# def smooth_mean_encoding(col, target, m, global_mean):
#     agg = df10.groupby(col)[target].agg(['mean', 'count'])
#     mean_encoded = (agg['count'] * agg['mean'] + m * global_mean) / (agg['count'] + m)
#     return mean_encoded

# # Apply the smooth mean encoding to the 'standardized_location' column
# mean_encoded = smooth_mean_encoding('standardized_location', 'price', m, global_mean)
# df10['location_encoded'] = df10['standardized_location'].map(mean_encoded)

# print(df10)


In [None]:
df10.head()

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk,log_total_sqft,log_price,std_total_sqft,std_price,norm_total_sqft,norm_price,standardized_total_sqft
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,2.0,6.96319,3.690628,-0.051716,-0.637914,0.000807,0.031479,-0.565018
1,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4.0,7.863651,4.795791,0.056086,0.131356,0.001989,0.113475,1.339606
2,Uttarahalli,1440.0,2.0,3.0,62.0,3.0,7.273093,4.143135,-0.024905,-0.419955,0.001101,0.054711,-0.091329
3,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3.0,7.327781,4.564348,-0.01925,-0.106278,0.001163,0.088146,0.00859
4,Kothanur,1200.0,2.0,1.0,51.0,2.0,7.09091,3.951244,-0.041662,-0.524514,0.000918,0.043566,-0.387385


In [None]:
# drop the following columns standardized_location	log_total_sqft	log_price	std_total_sqft	std_price	std_price_per_sqft	norm_total_sqft	norm_price	norm_price_per_sqft	log_price_per_sqft	standardized_total_sqft

# df11 = df10.drop(['standardized_location', 'log_total_sqft', 'log_price', 'std_total_sqft',
#                    'std_price', 'std_price_per_sqft', 'norm_total_sqft', 'norm_price',
#                    'norm_price_per_sqft', 'log_price_per_sqft', 'standardized_total_sqft'], axis='columns')
# df11.head(30)

df11 = df10.drop([ 'log_total_sqft', 'log_price', 'std_total_sqft',
                   'std_price', 'norm_total_sqft', 'norm_price',
                    'standardized_total_sqft'], axis='columns')
# df11.head(30)



In [None]:
df11.shape

(11853, 6)

In [None]:


# Save DataFrame to CSV with a relative path
df11.to_csv('data/cleaned.csv', index=False)
