In [13]:
import pandas as pd
import numpy as np 
import seaborn as sns 
import sklearn
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import confusion_matrix, mean_squared_error, r2_score 
from sklearn.model_selection import train_test_split 
from sklearn.feature_selection import SelectKBest 
import matplotlib.pyplot as plt 
from sklearn.metrics import confusion_matrix 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.utils import shuffle
from sklearn.metrics import precision_score, accuracy_score, recall_score 
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor

# Read TSV file
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')
# Drop rows with missing values
train.dropna(inplace=True)
# 处理异常值
train = train[(train['revenue'] > 0) & (train['revenue'] < 1e6)]
train = train[(train['rating'] != 0)] # rating is between 1 and 5
# remove the stop word in ATM_Location_TYPE
stop_words = text.ENGLISH_STOP_WORDS.union(['Only', 'and'])
train['ATM_Location_TYPE'] = train['ATM_Location_TYPE'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
test['ATM_Location_TYPE'] = test['ATM_Location_TYPE'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
# add feature
train['perhouse'] = train['Estimated_Number_of_Houses_in_1_KM_Radius'] / train['No_of_Other_ATMs_in_1_KM_radius']
test['perhouse'] = test['Estimated_Number_of_Houses_in_1_KM_Radius'] / test['No_of_Other_ATMs_in_1_KM_radius']

train

Unnamed: 0,Number_of_Shops_Around_ATM,ATM_Zone,No_of_Other_ATMs_in_1_KM_radius,Estimated_Number_of_Houses_in_1_KM_Radius,ATM_Placement,ATM_TYPE,ATM_Location_TYPE,ATM_looks,ATM_Attached_to,Average_Wait_Time,Day_Type,rating,revenue,perhouse
0,66,RL,65,8450,Facing Road,Urban,WIthdraw,Normal,Building,3,Working,4,209500,130.000000
1,26,RL,80,9600,Facing Road,Urban,WIthdraw,Normal,Building,3,Working,3,184300,120.000000
2,65,RL,68,11250,Facing Road,Town,WIthdraw,Normal,Building,3,Working,4,231500,165.441176
3,80,RL,60,9550,Facing Road,Town,WIthdraw,Normal,Building,3,Working,4,143600,159.166667
4,66,RL,84,14260,Facing Road,Town,WIthdraw,Normal,Building,4,Working,4,255600,169.761905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150206,30,RL,79,9790,Facing Road,Urban,WIthdraw,Normal,Petrol Bunk,3,Working,3,149500,123.924051
150207,22,RL,65,36500,Facing Road,Town,Checkdrop Withdraw,New,Building,4,Working,3,197200,561.538462
150208,123,RL,40,5664,Facing Road,Town,WIthdraw,Normal,Building,2,Working,5,278700,141.600000
150209,64,RL,86,11065,Facing Road,Town,WIthdraw,Normal,Building,3,Working,5,282800,128.662791


In [14]:

# split the ATM_Location_TYPE column
from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import PolynomialFeatures


train['ATM_Location_TYPE'] = train['ATM_Location_TYPE'].str.split(' ')
train = train.explode('ATM_Location_TYPE')
# Encode the string values to numeric values
# columns = ['ATM_Zone', 'ATM_Placement', 'ATM_TYPE', 'ATM_Location_TYPE', 'ATM_looks', 'ATM_Attached_to', 'Day_Type'] need to be encoded
cols_to_transform = [ 'ATM_Zone', 'ATM_Placement', 'ATM_TYPE', 'ATM_Location_TYPE', 'ATM_looks', 'ATM_Attached_to', 'Day_Type']

# define the LabelEncoder
le = LabelEncoder()
# apply the LabelEncoder to the columns
for col in cols_to_transform:
    le.fit(train[col].unique().tolist() + test[col].unique().tolist())
    
    # 将每个数据集中的列转换为整数
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

# Standardizing/normalizing data
#scaler = StandardScaler()
#train[['Estimated_Number_of_Houses_in_1_KM_Radius']] = scaler.fit_transform(train[['Estimated_Number_of_Houses_in_1_KM_Radius']])
#test[['Estimated_Number_of_Houses_in_1_KM_Radius']] = scaler.fit_transform(test[['Estimated_Number_of_Houses_in_1_KM_Radius']])

# PolynomialFeatures
#poly = PolynomialFeatures(degree=2, include_bias=False)
#train_poly = poly.fit_transform(train.drop(['revenue'], axis=1))
#train_poly = pd.DataFrame(train_poly, columns=poly.get_feature_names(train.columns[:-1]))

# add new feature
train['ATM_Zone'] = train['ATM_Zone'] * train['ATM_Placement']
test['ATM_Zone'] = test['ATM_Zone'] * test['ATM_Placement']
#train['ATM_ty'] = train['ATM_TYPE'] * train['ATM_Location_TYPE']
#test['ATM_ty'] = test['ATM_TYPE'] * test['ATM_Location_TYPE']
train['ATM_looks_attach'] = train['ATM_looks'] * train['ATM_Attached_to']
test['ATM_looks_attach'] = test['ATM_looks'] * test['ATM_Attached_to']
train['ATM_looks_type'] = train['ATM_looks'] * train['ATM_TYPE']
test['ATM_looks_type'] = test['ATM_looks'] * test['ATM_TYPE']

# Divide the data into independent variables and dependent variables.
X = train.drop(['rating'], axis=1)
Y = train['rating']
# show the full result
pd.set_option('display.max_columns', None)
X

Unnamed: 0,Number_of_Shops_Around_ATM,ATM_Zone,No_of_Other_ATMs_in_1_KM_radius,Estimated_Number_of_Houses_in_1_KM_Radius,ATM_Placement,ATM_TYPE,ATM_Location_TYPE,ATM_looks,ATM_Attached_to,Average_Wait_Time,Day_Type,revenue,perhouse,ATM_looks_attach,ATM_looks_type
0,66,0,65,8450,0,3,7,1,0,3,3,209500,130.000000,0,3
1,26,0,80,9600,0,3,7,1,0,3,3,184300,120.000000,0,3
2,65,0,68,11250,0,2,7,1,0,3,3,231500,165.441176,0,2
3,80,0,60,9550,0,2,7,1,0,3,3,143600,159.166667,0,2
4,66,0,84,14260,0,2,7,1,0,4,3,255600,169.761905,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150207,22,0,65,36500,0,2,0,0,0,4,3,197200,561.538462,0,0
150207,22,0,65,36500,0,2,8,0,0,4,3,197200,561.538462,0,0
150208,123,0,40,5664,0,2,7,1,0,2,3,278700,141.600000,0,2
150209,64,0,86,11065,0,2,7,1,0,3,3,282800,128.662791,0,2


In [15]:
# 向后消除法特征选择
while True:
    # 拟合模型
    model = sm.OLS(Y, sm.add_constant(X)).fit()
    # 计算每个特征的p值
    p_values = model.pvalues.iloc[1:]
    print(p_values)
    # 选择p值最大的特征
    max_p_value = p_values.max()
    # 如果最大的p值大于0.05，则删除该特征
    if max_p_value > 0.05:
        feature_to_drop = p_values.idxmax()
        X = X.drop([feature_to_drop], axis=1)
        test = test.drop([feature_to_drop], axis=1)
    # 如果最大的p值小于等于0.05，停止迭代
    else:
        break
X

Number_of_Shops_Around_ATM                   6.246400e-149
ATM_Zone                                      5.442570e-03
No_of_Other_ATMs_in_1_KM_radius               8.198906e-69
Estimated_Number_of_Houses_in_1_KM_Radius     1.433570e-58
ATM_Placement                                 4.315639e-35
ATM_TYPE                                      1.018589e-34
ATM_Location_TYPE                             2.295672e-27
ATM_looks                                     3.314444e-12
ATM_Attached_to                               1.316285e-33
Average_Wait_Time                             0.000000e+00
Day_Type                                      0.000000e+00
revenue                                       0.000000e+00
perhouse                                      8.488265e-01
ATM_looks_attach                              1.343046e-09
ATM_looks_type                                6.862149e-31
dtype: float64
Number_of_Shops_Around_ATM                   7.389522e-151
ATM_Zone                                 

Unnamed: 0,Number_of_Shops_Around_ATM,ATM_Zone,No_of_Other_ATMs_in_1_KM_radius,Estimated_Number_of_Houses_in_1_KM_Radius,ATM_Placement,ATM_TYPE,ATM_Location_TYPE,ATM_looks,ATM_Attached_to,Average_Wait_Time,Day_Type,revenue,ATM_looks_attach,ATM_looks_type
0,66,0,65,8450,0,3,7,1,0,3,3,209500,0,3
1,26,0,80,9600,0,3,7,1,0,3,3,184300,0,3
2,65,0,68,11250,0,2,7,1,0,3,3,231500,0,2
3,80,0,60,9550,0,2,7,1,0,3,3,143600,0,2
4,66,0,84,14260,0,2,7,1,0,4,3,255600,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150207,22,0,65,36500,0,2,0,0,0,4,3,197200,0,0
150207,22,0,65,36500,0,2,8,0,0,4,3,197200,0,0
150208,123,0,40,5664,0,2,7,1,0,2,3,278700,0,2
150209,64,0,86,11065,0,2,7,1,0,3,3,282800,0,2


In [16]:
# 训练随机森林分类器
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support

for i in range(20, 50, 2):

    clf = RandomForestClassifier(n_estimators=200, random_state=42)
    clf.fit(X, Y)

    # 预测测试集评级
    y_pred = clf.predict(test.drop(['rating'], axis=1))
    accuracy = accuracy_score(test['rating'], y_pred)
    print(str(i) + ', ' + str(accuracy))

y_pred = pd.DataFrame(y_pred, columns=['rating'])
y_pred.to_csv('z5320711.PART2.output.csv', index=False)

20, 0.9996473906911142
22, 0.9996473906911142
24, 0.9996473906911142
26, 0.9996473906911142
28, 0.9996473906911142
30, 0.9996473906911142
32, 0.9996473906911142
34, 0.9996473906911142
36, 0.9996473906911142
38, 0.9996473906911142
40, 0.9996473906911142
42, 0.9996473906911142
44, 0.9996473906911142
46, 0.9996473906911142
48, 0.9996473906911142


In [17]:

# 评估模型性能
accuracy = accuracy_score(test['rating'], y_pred)
accuracy

# 计算每个类别的精确度，召回率和F1分数
precision, recall, f1, _ = precision_recall_fscore_support(test['rating'], y_pred, average=None)

# 计算加权平均精度，召回率和F1分数
weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(test['rating'], y_pred, average='weighted')

print('Precision:', precision)
print('Recall:', recall)
print('F1:', f1)
print('Weighted Precision:', weighted_precision)
print('Weighted Recall:', weighted_recall)
print('Weighted F1:', weighted_f1)

Precision: [1.         0.99930216 1.         1.        ]
Recall: [1.         1.         0.99911894 1.        ]
F1: [1.         0.99965096 0.99955928 1.        ]
Weighted Precision: 0.999647636754833
Weighted Recall: 0.9996473906911142
Weighted F1: 0.9996473745270371
