### Import Required Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import re
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

### Import all data files

In [4]:
train_data = pd.read_csv("train.csv")
dist_data = pd.read_csv('dist_from_city_centre.csv')
rent_data = pd.read_csv('avg_rent.csv')
test_data = pd.read_csv("test.csv")

In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10656 entries, 0 to 10655
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ID            10656 non-null  int64  
 1   area_type     10656 non-null  object 
 2   availability  10656 non-null  object 
 3   location      10655 non-null  object 
 4   size          10642 non-null  object 
 5   society       6228 non-null   object 
 6   total_sqft    10656 non-null  object 
 7   bath          10591 non-null  float64
 8   balcony       10152 non-null  float64
 9   price         10656 non-null  float64
dtypes: float64(3), int64(1), object(6)
memory usage: 832.6+ KB


In [6]:
train_data.head(3)

Unnamed: 0,ID,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0


In [7]:
dist_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   location        500 non-null    object 
 1   dist_from_city  500 non-null    float64
dtypes: float64(1), object(1)
memory usage: 7.9+ KB


In [8]:
dist_data.head(3)

Unnamed: 0,location,dist_from_city
0,Whitefield,17.3
1,Sarjapur Road,17.2
2,Electronic City,18.1


In [9]:
rent_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   location       157 non-null    object
 1   avg_2bhk_rent  157 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 2.6+ KB


In [10]:
rent_data.head()

Unnamed: 0,location,avg_2bhk_rent
0,Krishnarajapura,11954
1,Sarjapur,45000
2,Whitefield Hope Farm Junction,26370
3,Devanahalli,17302
4,Whitefield,14981


In [11]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2664 entries, 0 to 2663
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ID            2664 non-null   int64  
 1   area_type     2664 non-null   object 
 2   availability  2664 non-null   object 
 3   location      2664 non-null   object 
 4   size          2662 non-null   object 
 5   society       1590 non-null   object 
 6   total_sqft    2664 non-null   object 
 7   bath          2656 non-null   float64
 8   balcony       2559 non-null   float64
dtypes: float64(2), int64(1), object(6)
memory usage: 187.4+ KB


In [12]:
test_data.head()

Unnamed: 0,ID,area_type,availability,location,size,society,total_sqft,bath,balcony
0,0,Super built-up Area,Ready To Move,Chamrajpet,2 BHK,,650,1.0,1.0
1,1,Super built-up Area,Ready To Move,7th Phase JP Nagar,3 BHK,SrncyRe,1370,2.0,1.0
2,2,Super built-up Area,Ready To Move,Whitefield,3 BHK,AjhalNa,1725,3.0,2.0
3,3,Built-up Area,Ready To Move,Jalahalli,2 BHK,,1000,2.0,0.0
4,4,Plot Area,Ready To Move,TC Palaya,1 Bedroom,,1350,1.0,0.0


### Common Data Processing functions

In [14]:
# Delete non-preferable rows and columns
def remove_NonPreferable_Rows_Columns(data) :
    #data = data.dropna(subset=['location'])
    #data = data.dropna(subset=['size'])
    data['size'] = data['size'].fillna('Unknown')
    data['location'] = data['location'].fillna('Unknown')
    data = data.drop('society', axis=1)
    return data

In [15]:
# Convert the 'size' values to uniform in nature
def convert_Size_Uniformely(data) :
    data['size'] = data['size'].str.replace('Bedroom', 'BHK')
    return data

In [16]:
def process_total_sqft(value):
    value = str(value.strip().lower().replace(" ", ""))
    if '-' in value:
        # Split the string at the hyphen, strip spaces, and calculate the average
        parts = value.split('-')
        part1 = float(parts[0].strip())  # Trim spaces and convert to float
        part2 = float(parts[1].strip())  # Trim spaces and convert to float
        avg_value = (part1 + part2) / 2
        return str(avg_value)
    elif value.endswith(".0"):
        return value[:-2]
    elif (value.endswith("acres") or value.endswith("acre") or value.endswith("acr") or value.endswith("acrs")):
        value = value.replace("acres", "").replace("acre", "").replace("acr", "").replace("acrs", "")
        return str(float(value) * 43560)
    elif (value.endswith("sq.meter") or value.endswith("sq.mtr") or value.endswith("sqmeter") or value.endswith("sqmtr")):
        value = value.replace("sq.meter", "").replace("sq.mtr", "").replace("sqmeter", "").replace("sqmtr", "")
        return str(float(value) * 10.7639)
    elif (value.endswith("perch") or value.endswith("prch")):
        value = value.replace("perch", "").replace("prch", "")
        return str(float(value) * 272.25)
    elif (value.endswith("sq.yards") or value.endswith("sq.yard") or value.endswith("sqyrd") or value.endswith("sqyrds")):
        value = value.replace("sq.yards", "").replace("sq.yard", "").replace("sqyrd", "").replace("sqyrds", "")
        return str(float(value) * 9)
    elif (value.endswith("cents") or value.endswith("cent") or value.endswith("cnts") or value.endswith("cnt")):
        value = value.replace("cents", "").replace("cent", "").replace("cnts", "").replace("cnt", "")
        return str(float(value) * 435.6)
    elif (value.endswith("gunthas") or value.endswith("guntha") or value.endswith("guntas") or value.endswith("gunta")):
        value = value.replace("gunthas", "").replace("guntha", "").replace("guntas", "").replace("gunta", "")
        return str(float(value) * 1089)
    elif (value.endswith("grounds") or value.endswith("ground") or value.endswith("grnds") or value.endswith("grnd")):
        value = value.replace("grounds", "").replace("ground", "").replace("grnds", "").replace("grnd", "")
        return str(float(value) * 2400)
    else:
        return value

In [17]:
def convert_uniform_totalSqrt(data) :
    non_integer_values = data[data['total_sqft'].apply(lambda x: not str(x).isdigit())]
    data['total_sqft'] = data['total_sqft'].apply(process_total_sqft)
    data['total_sqft'] = data['total_sqft'].apply(process_total_sqft)
    data['total_sqft'] = data['total_sqft'].astype(float).round().astype(int)
    return data

In [18]:
# Catgeories based on total square feet
def generate_sqft_cat_column(data) :
    data['total_sqft_category'] = pd.cut(
        data['total_sqft'], 
        bins=[0, 49, 100, 500, 1000, 5000, 10000, 15000, 25000, 50000, 100000, 200000, 500000, 1000000, 2000000, 5000000, 10000000, 20000000],
        labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
        include_lowest=True
    )
    data['total_sqft_category'] = data['total_sqft_category'].astype(int)
    return data

In [19]:
def avg_null_bath_values(data):
    if isinstance(data.index, pd.MultiIndex):
        data.reset_index(inplace=True)
    grouped = data.groupby(['area_type', 'total_sqft_category', 'size'])
    data['bath'] = grouped['bath'].transform(lambda x: x.fillna(x.mean()))
    print(data['bath'].isnull().sum())
    grouped = data.groupby(['area_type', 'total_sqft_category'])
    data['bath'] = grouped['bath'].transform(lambda x: x.fillna(x.mean()))
    data['bath'] = data['bath'].astype(float).round().astype(int)
    return data;

In [20]:
def avg_null_balcony_values(data):
    if isinstance(data.index, pd.MultiIndex):
        data.reset_index(inplace=True)
    grouped = data.groupby(['area_type', 'total_sqft_category', 'size'])
    data['balcony'] = grouped['balcony'].transform(lambda x: x.fillna(x.mean()))
    print(data['balcony'].isnull().sum())
    data['balcony'] = data['balcony'].astype(float).round()

    if isinstance(data.index, pd.MultiIndex):
        data.reset_index(inplace=True)
    grouped = data.groupby(['area_type', 'total_sqft_category'])
    data['balcony'] = grouped['balcony'].transform(lambda x: x.fillna(x.mean()))
    print(data['balcony'].isnull().sum())
    data['balcony'] = data['balcony'].astype(float).round()

    if isinstance(data.index, pd.MultiIndex):
        data.reset_index(inplace=True)
    grouped = data.groupby(['area_type', 'size'])
    data['balcony'] = grouped['balcony'].transform(lambda x: x.fillna(x.mean()))
    print(data['balcony'].isnull().sum())
    data['balcony'] = data['balcony'].astype(float).round()
    
    return data;

### Data pre processing- Stage 1: train.csv and test.csv

In [22]:
processed_train_data = remove_NonPreferable_Rows_Columns(train_data)
processed_train_data = convert_Size_Uniformely(processed_train_data)
processed_train_data = convert_uniform_totalSqrt(processed_train_data)
processed_train_data = generate_sqft_cat_column(processed_train_data)
processed_train_data = avg_null_bath_values(processed_train_data)
processed_train_data = avg_null_balcony_values(processed_train_data)
processed_train_data.head(3)
processed_train_data.to_csv("processed_train_data_stg1.csv", index=False)

14
30
1
0


In [23]:
processed_test_data = remove_NonPreferable_Rows_Columns(test_data)
processed_test_data = convert_Size_Uniformely(processed_test_data)
processed_test_data = convert_uniform_totalSqrt(processed_test_data)
processed_test_data = generate_sqft_cat_column(processed_test_data)
processed_test_data = avg_null_bath_values(processed_test_data)
processed_test_data = avg_null_balcony_values(processed_test_data)
processed_test_data.head(3)
processed_test_data.to_csv("processed_test_data_stg1.csv", index=False)

2
7
0
0


In [24]:
missing_count = processed_train_data.isnull().sum()
print(missing_count)

ID                     0
area_type              0
availability           0
location               0
size                   0
total_sqft             0
bath                   0
balcony                0
price                  0
total_sqft_category    0
dtype: int64


In [25]:
missing_count = processed_test_data.isnull().sum()
print(missing_count)

ID                     0
area_type              0
availability           0
location               0
size                   0
total_sqft             0
bath                   0
balcony                0
total_sqft_category    0
dtype: int64


### Data pre processing- Stage 2: Merge Primary and Secondary data sets and process

In [27]:
# Catgeories based on total square feet
def generate_distance_cat_column(data) :
    data['dist_from_city_cat'] = pd.cut(
        data['dist_from_city'], 
        bins=[0, 2, 5, 8, 13, 20, 30, 50, 70, 100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000],
        labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
        include_lowest=True
    )
    data['dist_from_city_cat'] = data['dist_from_city_cat'].astype(int)
    return data

In [28]:
def fill_null_avg_2bhk_rent(data):
    def first_level(data) :
        if isinstance(data.index, pd.MultiIndex):
            data.reset_index(inplace=True)
        grouped = data.groupby(['dist_from_city_cat'])
        data['avg_2bhk_rent'] = grouped['avg_2bhk_rent'].transform(lambda x: x.fillna(x.mean()))
        data['avg_2bhk_rent'] = data['avg_2bhk_rent'].astype(float).round()
        return data;
    data = first_level(data)
    data['avg_2bhk_rent'] = data['avg_2bhk_rent'].fillna(data['avg_2bhk_rent'].mean())
    return data

In [29]:
processed_train_data = processed_train_data.merge(dist_data, on='location', how='left')
processed_train_data = processed_train_data.merge(rent_data, on='location', how='left')
processed_train_data.head(3)

Unnamed: 0,ID,area_type,availability,location,size,total_sqft,bath,balcony,price,total_sqft_category,dist_from_city,avg_2bhk_rent
0,0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2,1.0,39.07,5,19.3,11500.0
1,1,Plot Area,Ready To Move,Chikka Tirupathi,4 BHK,2600,5,3.0,120.0,5,34.6,
2,2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2,3.0,62.0,5,12.9,19750.0


In [30]:
processed_train_data['dist_from_city'] = processed_train_data['dist_from_city'].fillna(processed_train_data['dist_from_city'].mean())
processed_train_data = generate_distance_cat_column(processed_train_data)
processed_train_data = fill_null_avg_2bhk_rent(processed_train_data)
processed_train_data.head(3)

Unnamed: 0,ID,area_type,availability,location,size,total_sqft,bath,balcony,price,total_sqft_category,dist_from_city,avg_2bhk_rent,dist_from_city_cat
0,0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2,1.0,39.07,5,19.3,11500.0,5
1,1,Plot Area,Ready To Move,Chikka Tirupathi,4 BHK,2600,5,3.0,120.0,5,34.6,17412.0,7
2,2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2,3.0,62.0,5,12.9,19750.0,4


In [31]:
processed_test_data = processed_test_data.merge(dist_data, on='location', how='left')
processed_test_data = processed_test_data.merge(rent_data, on='location', how='left')
processed_test_data.head(3)

Unnamed: 0,ID,area_type,availability,location,size,total_sqft,bath,balcony,total_sqft_category,dist_from_city,avg_2bhk_rent
0,0,Super built-up Area,Ready To Move,Chamrajpet,2 BHK,650,1,1.0,4,6.7,15875.0
1,1,Super built-up Area,Ready To Move,7th Phase JP Nagar,3 BHK,1370,2,1.0,5,11.0,
2,2,Super built-up Area,Ready To Move,Whitefield,3 BHK,1725,3,2.0,5,17.3,14981.0


In [32]:
processed_test_data['dist_from_city'] = processed_test_data['dist_from_city'].fillna(processed_test_data['dist_from_city'].mean())
processed_test_data = generate_distance_cat_column(processed_test_data)
processed_test_data = fill_null_avg_2bhk_rent(processed_test_data)
processed_test_data.head(3)

Unnamed: 0,ID,area_type,availability,location,size,total_sqft,bath,balcony,total_sqft_category,dist_from_city,avg_2bhk_rent,dist_from_city_cat
0,0,Super built-up Area,Ready To Move,Chamrajpet,2 BHK,650,1,1.0,4,6.7,15875.0,3
1,1,Super built-up Area,Ready To Move,7th Phase JP Nagar,3 BHK,1370,2,1.0,5,11.0,15727.0,4
2,2,Super built-up Area,Ready To Move,Whitefield,3 BHK,1725,3,2.0,5,17.3,14981.0,5


In [33]:
processed_train_data.to_csv("processed_train_data_stg2.csv", index=False)
processed_test_data.to_csv("processed_test_data_stg2.csv", index=False)

In [34]:
missing_count = processed_train_data.isnull().sum()
print(missing_count)

ID                     0
area_type              0
availability           0
location               0
size                   0
total_sqft             0
bath                   0
balcony                0
price                  0
total_sqft_category    0
dist_from_city         0
avg_2bhk_rent          0
dist_from_city_cat     0
dtype: int64


In [35]:
missing_count = processed_train_data.isnull().sum()
print(missing_count)

ID                     0
area_type              0
availability           0
location               0
size                   0
total_sqft             0
bath                   0
balcony                0
price                  0
total_sqft_category    0
dist_from_city         0
avg_2bhk_rent          0
dist_from_city_cat     0
dtype: int64


### Data pre processing- Stage 3: Further Data encoding

In [37]:
def availability_generalize(value):
    value = str(value)
    if '-' in value:
        return value.split('-')[1].strip().lower()
    if 'ready to move' in value:
        return "ready to move in"
    return value.strip().lower()

In [38]:
processed_train_data['availability'] = processed_train_data['availability'].apply(availability_generalize)
processed_test_data['availability'] = processed_test_data['availability'].apply(availability_generalize)

In [39]:
processed_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10656 entries, 0 to 10655
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   10656 non-null  int64  
 1   area_type            10656 non-null  object 
 2   availability         10656 non-null  object 
 3   location             10656 non-null  object 
 4   size                 10656 non-null  object 
 5   total_sqft           10656 non-null  int32  
 6   bath                 10656 non-null  int32  
 7   balcony              10656 non-null  float64
 8   price                10656 non-null  float64
 9   total_sqft_category  10656 non-null  int32  
 10  dist_from_city       10656 non-null  float64
 11  avg_2bhk_rent        10656 non-null  float64
 12  dist_from_city_cat   10656 non-null  int32  
dtypes: float64(4), int32(4), int64(1), object(4)
memory usage: 915.9+ KB


In [40]:
processed_test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2664 entries, 0 to 2663
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2664 non-null   int64  
 1   area_type            2664 non-null   object 
 2   availability         2664 non-null   object 
 3   location             2664 non-null   object 
 4   size                 2664 non-null   object 
 5   total_sqft           2664 non-null   int32  
 6   bath                 2664 non-null   int32  
 7   balcony              2664 non-null   float64
 8   total_sqft_category  2664 non-null   int32  
 9   dist_from_city       2664 non-null   float64
 10  avg_2bhk_rent        2664 non-null   float64
 11  dist_from_city_cat   2664 non-null   int32  
dtypes: float64(3), int32(4), int64(1), object(4)
memory usage: 208.3+ KB


In [41]:
#categorical_columns = processed_train_data.select_dtypes(include=['object']).columns
categorical_columns = ['area_type', 'availability', 'location', 'size']
categorical_columns = [col for col in categorical_columns if col in processed_train_data.columns and col in processed_test_data.columns]

ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(processed_train_data[categorical_columns])

train_encoded = ohe.transform(processed_train_data[categorical_columns])
test_encoded = ohe.transform(processed_test_data[categorical_columns])

feature_names = ohe.get_feature_names_out(categorical_columns)

train_encoded_df = pd.DataFrame(train_encoded, columns=feature_names)
test_encoded_df = pd.DataFrame(test_encoded, columns=feature_names)

processed_train_data = processed_train_data.drop(columns=categorical_columns).reset_index(drop=True)
processed_test_data = processed_test_data.drop(columns=categorical_columns).reset_index(drop=True)
processed_train_data = pd.concat([processed_train_data, train_encoded_df], axis=1)
processed_test_data = pd.concat([processed_test_data, test_encoded_df], axis=1)

In [42]:
processed_train_data.to_csv("processed_train_data_stg3.csv", index=False)
processed_test_data.to_csv("processed_test_data_stg3.csv", index=False)

In [43]:
processed_test_data.head(3)

Unnamed: 0,ID,total_sqft,bath,balcony,total_sqft_category,dist_from_city,avg_2bhk_rent,dist_from_city_cat,area_type_Built-up Area,area_type_Carpet Area,...,size_27 BHK,size_3 BHK,size_4 BHK,size_43 BHK,size_5 BHK,size_6 BHK,size_7 BHK,size_8 BHK,size_9 BHK,size_Unknown
0,0,650,1,1.0,4,6.7,15875.0,3,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1370,2,1.0,5,11.0,15727.0,4,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,1725,3,2.0,5,17.3,14981.0,5,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Model Building

In [45]:
y = processed_train_data['price']
X = processed_train_data.drop(['ID', 'price'], axis=1)

In [46]:
from sklearn.model_selection import train_test_split

# Split data into training and temporary sets (70% train, 30% temp)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Further split the temporary set into validation (20%) and test sets (80%)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.8, random_state=42)

In [47]:
param_grid = {'fit_intercept': [True, False]}

# Create a linear regression model object
model = LinearRegression()

# Create a GridSearchCV object for hyperparameter tuning
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Get the best model
best_model = grid_search.best_estimator_

<IPython.core.display.Javascript object>

### Model Validation and tuning

In [49]:
# Evaluate the best model on the validation set
y_pred_val = best_model.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = np.sqrt(mse_val)
mae_val = mean_absolute_error(y_val, y_pred_val)
r2_val = r2_score(y_val, y_pred_val)

# Print validation set performance metrics
print(f"Validation MSE: {mse_val}")
print(f"Validation RMSE: {rmse_val}")
print(f"Validation MAE: {mae_val}")
print(f"Validation R-squared: {r2_val}")

Validation MSE: 8878.15092599706
Validation RMSE: 94.22394030179942
Validation MAE: 50.19884566272742
Validation R-squared: 0.44982393519497654


In [50]:
# Evaluate the best model on the unseen test set
y_pred_test = best_model.predict(X_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = np.sqrt(mse_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)

# Print test set performance metrics
print(f"Test MSE: {mse_test}")
print(f"Test RMSE: {rmse_test}")
print(f"Test MAE: {mae_test}")
print(f"Test R-squared: {r2_test}")

Test MSE: 12155.541812281555
Test RMSE: 110.25217373041474
Test MAE: 50.86066540130608
Test R-squared: 0.35660039769017304


In [51]:
# Extract features from test data (no target column)
testdata_ID = processed_test_data['ID']
X_test = processed_test_data.drop(['ID'], axis=1)

# Create and fit the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Save the predictions (you can save this to a file)
predictions_df = pd.DataFrame({'price': y_pred})
merged_df = pd.concat([testdata_ID, predictions_df], axis=1)
merged_df.to_csv("Submission_Data02.csv", index=False) 

<IPython.core.display.Javascript object>

Unnamed: 0,ID,price
0,0,134.809130
1,1,72.045733
2,2,116.051457
3,3,8.104364
4,4,113.495560
...,...,...
2659,2659,243.898869
2660,2660,330.935626
2661,2661,53.549005
2662,2662,228.558934


In [52]:
'test2'

'test2'