In [1]:
# 导入包

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score 

In [2]:
# 导入清洗后的数据集

df_clean = pd.read_csv("data_cleaned.csv")
df_clean.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2014-03-12,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,M&T BANK CORPORATION,MI,48382.0,,,Referral,2014-03-17,Closed with explanation,Yes,No,759217
1,2017-01-19,Student loan,Federal student loan servicing,Dealing with the lender or servicer,Received bad information about my loan,When my loan was switched over to Navient i wa...,,"Navient Solutions, LLC.",LA,,,Consent provided,Web,2017-01-19,Closed with explanation,Yes,No,2296496
2,2018-04-06,Credit card/Prepaid card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,I tried to sign up for a spending monitoring p...,,CAPITAL ONE FINANCIAL CORPORATION,VA,,Older American,Consent provided,Web,2018-04-06,Closed with explanation,Yes,,2866101
3,2014-06-08,Credit card/Prepaid card,Credit card,Bankruptcy,,,,AMERICAN EXPRESS COMPANY,ID,83854.0,Older American,,Web,2014-06-10,Closed with explanation,Yes,Yes,885638
4,2014-09-13,Debt collection,Credit card,Communication tactics,Frequent or repeated calls,,,"CITIBANK, N.A.",VA,23233.0,,,Web,2014-09-13,Closed with explanation,Yes,Yes,1027760


In [3]:
# 使用逻辑回归预测‘Consumer disputed?’

In [4]:
## 数据预处理

### 需要建立虚拟变量的列
dummy_cols = ['Product','Issue','Company public response','Company','State','Consumer consent provided?',
       'Submitted via','Company response to consumer','Timely response?','Consumer disputed?']

### 删除数据集中不用与建模的列
df_model_0 = df_clean.drop(['Date received',
                            'Sub-product',
                            'Sub-issue',
                            'Consumer complaint narrative',
                            'Complaint ID',
                            'Date sent to company',
                            'Tags',
                            'ZIP code'],
                             axis=1)

### 预处理1——去除缺失值

df_model_1 = df_model_0.dropna()
df_model_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25739 entries, 6 to 279955
Data columns (total 10 columns):
Product                         25739 non-null object
Issue                           25739 non-null object
Company public response         25739 non-null object
Company                         25739 non-null object
State                           25739 non-null object
Consumer consent provided?      25739 non-null object
Submitted via                   25739 non-null object
Company response to consumer    25739 non-null object
Timely response?                25739 non-null object
Consumer disputed?              25739 non-null object
dtypes: object(10)
memory usage: 2.2+ MB


In [11]:
### 预处理2—— 对分类变量创建虚拟变量

#### 对于建模分类变量，采取两种方法处理：
#### 类别变量或者二元变量，采取pd.get_dummies()转换为虚拟变量

dummy_cols = ['Submitted via','Timely response?','Consumer disputed?','Consumer consent provided?']

for col in dummy_cols:
    df_model_1[col] = df_model_1[col].astype('category')### object——>category
    
df_model = pd.get_dummies(df_model_1,columns=dummy_cols,drop_first=True)

#### 标称属性，即分类等级较多的变量，采取pd.factorize()将每个分类等级映射为唯一的数字表示

df_model['Product'] = pd.factorize(df_model['Product'])[0]
df_model['Issue'] = pd.factorize(df_model['Issue'])[0]
df_model['Company public response'] = pd.factorize(df_model['Company public response'])[0]
df_model['Company'] = pd.factorize(df_model['Company'])[0]
df_model['State'] = pd.factorize(df_model['State'])[0]
#df_model['ZIP code'] = pd.factorize(df_model['ZIP code'])[0]
df_model['Company response to consumer'] = pd.factorize(df_model['Company response to consumer'])[0]

df_model.info()
df_model.columns

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25739 entries, 6 to 279955
Data columns (total 11 columns):
Product                                         25739 non-null int64
Issue                                           25739 non-null int64
Company public response                         25739 non-null int64
Company                                         25739 non-null int64
State                                           25739 non-null int64
Company response to consumer                    25739 non-null int64
Timely response?_Yes                            25739 non-null uint8
Consumer disputed?_Yes                          25739 non-null uint8
Consumer consent provided?_Consent provided     25739 non-null uint8
Consumer consent provided?_Consent withdrawn    25739 non-null uint8
Consumer consent provided?_Other                25739 non-null uint8
dtypes: int64(6), uint8(5)
memory usage: 1.5 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Index(['Product', 'Issue', 'Company public response', 'Company', 'State',
       'Company response to consumer', 'Timely response?_Yes',
       'Consumer disputed?_Yes', 'Consumer consent provided?_Consent provided',
       'Consumer consent provided?_Consent withdrawn',
       'Consumer consent provided?_Other'],
      dtype='object')

In [12]:
## 逻辑回归建模
cols = [i for i in df_model.columns if i != 'Consumer disputed?_Yes']

X = df_model[cols]
y = df_model['Consumer disputed?_Yes']

lr_model = LogisticRegression() 

In [13]:
## 模型训练

lr_model.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [14]:
## 模型评估

lr_model.score(X,y)

0.828353859901317

In [15]:
## 模型验证——简单验证

### 将数据分为训练集和测试集
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
### 使用训练集训练模型
lr_model.fit(X_train,y_train)
### 模型评分
print(lr_model.score(X_test,y_test)) 
### 模型预测
predict_test = lr_model.predict(X_test)
### 计算模型分类准确率
predict_results =np.in1d(predict_test,y_test.values)
sum(predict_results)/predict_results.size

0.8325563325563325


1.0

In [16]:
## 模型验证——k折交叉验证
print(np.mean(cross_val_score(lr_model,X,y,cv=10)))

0.828353879563657
