In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Toy frame holding a single nominal column.
df = pd.DataFrame(['red', 'blue', 'green'], columns=['color'])
# Expand the column into one indicator column per distinct value.
one_hot = pd.get_dummies(df.loc[:, 'color'])

In [3]:
# Show the indicator columns produced by get_dummies (one column per color).
one_hot

Unnamed: 0,blue,green,red
0,False,False,True
1,True,False,False
2,False,True,False


In [4]:
from sklearn.preprocessing import LabelEncoder

# Map each color to an integer code (for this data: blue=0, green=1, red=2).
# NOTE(review): scikit-learn's documentation describes LabelEncoder as meant
# for target labels (y); confirm whether OrdinalEncoder is preferred when the
# column is a model input.
le = LabelEncoder()
le.fit(df['color'])
df['color_encoded'] = le.transform(df['color'])

In [5]:
# Show the frame with the new color_encoded column next to the raw color.
df

Unnamed: 0,color,color_encoded
0,red,2
1,blue,0
2,green,1


In [6]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Toy frame holding one ordered categorical column.
df = pd.DataFrame(['small', 'medium', 'large'], columns=['size'])
# The explicit category list fixes the code order: small=0, medium=1, large=2.
ord_enc = OrdinalEncoder(categories=[['small', 'medium', 'large']])
# The encoder works on 2-D input, hence the double brackets around 'size'.
ord_enc.fit(df[['size']])
df['size_encoded'] = ord_enc.transform(df[['size']])


In [7]:
# Show the frame with the new size_encoded column next to the raw size.
df

Unnamed: 0,size,size_encoded
0,small,0.0
1,medium,1.0
2,large,2.0


In [8]:
import pandas as pd

# Toy frame in which 'New York' appears twice and the other cities once.
df = pd.DataFrame(['New York', 'Los Angeles', 'New York', 'Chicago'], columns=['city'])
# Lookup table: city name -> number of rows it appears in.
frequency_encoding = df.city.value_counts().to_dict()
# Replace each city by its occurrence count in a new column.
df = df.assign(city_encoded=df.city.map(frequency_encoding))

In [9]:
# Show each city alongside its occurrence count.
df

Unnamed: 0,city,city_encoded
0,New York,2
1,Los Angeles,1
2,New York,2
3,Chicago,1


In [13]:
import pandas as pd
from sklearn.feature_extraction import FeatureHasher

# Toy frame: three distinct cities, each appearing twice
df = pd.DataFrame({'city': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Chicago', 'Los Angeles']})

# Turn every row into a one-element list, so each sample handed to the
# hasher is a list containing the city name
cities = df['city'].map(lambda name: [name])

# Hash string tokens into a fixed space of 10 columns
hasher = FeatureHasher(n_features=10, input_type='string')

# transform() is called directly, without a prior fit
hashed_features = hasher.transform(cities)

# Densify the result and wrap it in a DataFrame (columns are labelled 0..9)
hashed_df = pd.DataFrame(hashed_features.toarray())

# Place the hashed columns to the right of the original city column
df = pd.concat([df, hashed_df], axis='columns')

print(df)



          city    0    1    2    3    4    5    6    7    8    9
0     New York  0.0  0.0  0.0  0.0 -1.0  0.0  0.0  0.0  0.0  0.0
1  Los Angeles  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 -1.0
2      Chicago  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0
3     New York  0.0  0.0  0.0  0.0 -1.0  0.0  0.0  0.0  0.0  0.0
4      Chicago  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0
5  Los Angeles  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 -1.0


In [14]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# Sample numeric data, written one (age, income) pair per row
df = pd.DataFrame(
    [
        (25, 50000),
        (32, 64000),
        (47, 120000),
        (51, 97000),
        (62, 62000),
    ],
    columns=['age', 'income'],
)

# Plain-text dump of the unscaled values for reference
print(df)


   age  income
0   25   50000
1   32   64000
2   47  120000
3   51   97000
4   62   62000


In [15]:
# Scaler used for this demo; MinMaxScaler() or RobustScaler() can be
# swapped in here instead
scaler = StandardScaler()

# Learn the per-column statistics, then overwrite both columns with their
# scaled values (the raw numbers in df are replaced)
scaler.fit(df[['age', 'income']])
df[['age', 'income']] = scaler.transform(df[['age', 'income']])

In [16]:
# Show the age and income columns after scaling.
df

Unnamed: 0,age,income
0,-1.382872,-1.1034
1,-0.85678,-0.563274
2,0.270562,1.59723
3,0.571186,0.70988
4,1.397904,-0.640435


In [28]:
import pandas as pd

# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("Data/Final_Clean_Dataset.csv")

In [29]:
def preprocess_states_data(df):
    """Normalise the free-text ``state`` column to canonical state names.

    Every value is lower-cased and stripped of surrounding whitespace, then
    each known alias (abbreviations, misspellings, and what appear to be
    suburb names and postcodes) is replaced by the full state name it is
    grouped under. Values that match no alias keep their lower-cased,
    stripped form.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``state`` column usable with the ``.str`` accessor.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with its ``state`` column overwritten.
    """
    # Canonical state name -> every raw spelling that is folded into it.
    # Replacements are applied in the order the states are listed here.
    aliases_by_state = {
        # 'ald' appears twice in this list; the repeat is redundant but harmless.
        'queensland': [
            'qld', 'gld', 'q', 'gladston', 'gladstone', 'ald', 'queensland',
            'davao del sur', 'west gladstone', 'ald', '4019', 'qld 4020',
            '4504', 'brookfield', 'north lakes', 'clifton beach',
            'new auckland', 'clontarf', 'margate', 'kippa ring', 'burua',
            'kippa-ring', 'tannum sands',
        ],
        # NOTE(review): 'y' and 'test' look like junk entries rather than
        # Tasmanian addresses - confirm they belong here.
        'tasmania': ['y', 'tasmania', 'tas', 'test'],
        # NOTE(review): 'nt' and 'nz' are folded into New South Wales -
        # confirm this grouping is intended.
        'new south wales': ['nsw', 'nt', 'newport', 'new south wales', 'nz'],
        'victoria': [
            'vic', 'victoria', 'voc', 'bvic', 'vic 3340', 'viv', 'vicq',
            'vic 3337', 'eynesbury', '3352', 'maddingley', 'hopetoun park',
            'bacchus marsh', 'strathtulloh', 'melton west', 'victoria vic',
            'melton south', 'melbourne', 'darley', 'sunbury',
        ],
        'western australia': ['wa', 'western australia', 'scarborough'],
        # NOTE(review): 'act' is folded into South Australia - confirm.
        'south australia': ['sa', 'act'],
    }

    # Case- and whitespace-insensitive matching against the alias lists.
    states = df['state'].str.lower().str.strip()
    for canonical, aliases in aliases_by_state.items():
        states = states.replace(aliases, canonical)

    df['state'] = states
    return df

In [35]:
# preprocess_states_data rewrites df['state'] in place and returns the same
# frame, so rebinding df is enough.
df = preprocess_states_data(df)

In [36]:
# Count rows per state after normalisation, as a check on which values remain.
df['state'].value_counts()

state
queensland           109839
victoria              31815
new south wales         816
western australia        62
south australia          34
tasmania                 22
Name: count, dtype: int64