In [3]:
#Hunter Ma 2023/5 studying in SEU
#以下内容为jupyter notebook内容的合并，我会发布另一个notebook的版本，所以建议拿这个缝合版一步一步运行

#Importing the basic libraries
#import 基本的包，后面用到再说
import math
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display

#from brokenaxes import brokenaxes
from statsmodels.formula import api
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import matplotlib.pyplot as plt
# Make 10 x 6 the default size of every figure created in this notebook.
plt.rcParams['figure.figsize'] = (10, 6)

# Install a global filter that hides all warnings (e.g. deprecation notices caused by
# version mismatches) so cell outputs stay short. Only warnings are hidden; exceptions still surface.
import warnings 
warnings.filterwarnings(action='ignore')



In [4]:
# Read the CSV file into a DataFrame and, in the same expression, drop the
# 'song_name' column (columns=[...] selects columns, i.e. the axis=1 direction).
df = pd.read_csv('../song_data.csv').drop(columns=['song_name'])

# Preview the first five rows of the resulting frame.
display(df.head())



Unnamed: 0,song_popularity,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,73,262333,0.00552,0.496,0.682,2.9e-05,8,0.0589,-4.095,1,0.0294,167.06,4,0.474
1,66,216933,0.0103,0.542,0.853,0.0,3,0.108,-6.407,0,0.0498,105.256,4,0.37
2,76,231733,0.00817,0.737,0.463,0.447,0,0.255,-7.828,1,0.0792,123.881,4,0.324
3,74,216933,0.0264,0.451,0.97,0.00355,0,0.102,-4.938,1,0.107,122.444,4,0.198
4,56,223826,0.000954,0.447,0.766,0.0,10,0.113,-5.065,1,0.0313,172.011,4,0.574


In [5]:
# Name of the label column that is going to be fitted.
target = 'song_popularity'

# Every other column of df is kept as an input feature, in its original column order.
features = [column for column in df.columns if column != target]



In [6]:
# Keep an independent snapshot of df before any further processing.
# DataFrame.copy() makes a deep copy by default: data and index are duplicated,
# so later changes to df do not propagate to original_df (and vice versa).
original_df = df.copy()

# df.shape is (rows, columns); it is reversed so the two placeholders are filled
# with the column count first and the row count second.
print('\n\033[1mInference:\033[0m The Datset consists of {} features & {} samples.'.format(*reversed(df.shape)))




[1mInference:[0m The Datset consists of 14 features & 18835 samples.


In [7]:
# Print a concise summary of df: the index range, every column's non-null count
# and dtype, and the memory usage of the frame.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18835 entries, 0 to 18834
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   song_popularity   18835 non-null  int64  
 1   song_duration_ms  18835 non-null  int64  
 2   acousticness      18835 non-null  float64
 3   danceability      18835 non-null  float64
 4   energy            18835 non-null  float64
 5   instrumentalness  18835 non-null  float64
 6   key               18835 non-null  int64  
 7   liveness          18835 non-null  float64
 8   loudness          18835 non-null  float64
 9   audio_mode        18835 non-null  int64  
 10  speechiness       18835 non-null  float64
 11  tempo             18835 non-null  float64
 12  time_signature    18835 non-null  int64  
 13  audio_valence     18835 non-null  float64
dtypes: float64(9), int64(5)
memory usage: 2.0 MB


In [8]:
# DataFrame.nunique() returns a Series indexed by column name whose values are the
# number of distinct values found in each column (the target column included);
# sort_values() orders those counts from smallest to largest.
# Low counts hint at categorical columns, high counts at continuous ones.
# The Series is passed to display() explicitly so it is shown regardless of where
# this statement sits in the cell. The split of the features into numerical and
# categorical groups based on these counts is carried out in the following cell.
display(df.nunique().sort_values())



audio_mode              2
time_signature          5
key                    12
song_popularity       101
danceability          849
energy               1132
speechiness          1224
audio_valence        1246
liveness             1425
acousticness         3209
instrumentalness     3925
loudness             8416
song_duration_ms    11771
tempo               12112
dtype: int64

In [9]:
# Distinct-value count of every non-target feature, ordered from fewest to most.
nu = df[features].nunique().sort_values()

# nf collects the numerical feature names, cf the categorical ones.
# nnf / ncf are zero-initialised counters that this cell does not update.
nf, cf = [], []
nnf, ncf = 0, 0

# A feature with at most 16 distinct values is treated as categorical,
# anything with more distinct values as numerical.
for name, n_unique in nu.items():
    bucket = cf if n_unique <= 16 else nf
    bucket.append(name)

# Report how many features ended up in each group.
print('\n\033[1mInference:\033[0m The Datset has {} numerical & {} categorical features.'.format(len(nf),len(cf)))

# Summary statistics (count, mean, std, min, quartiles, max) of every column of df.
display(df.describe())




[1mInference:[0m The Datset has 10 numerical & 3 categorical features.


Unnamed: 0,song_popularity,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
count,18835.0,18835.0,18835.0,18835.0,18835.0,18835.0,18835.0,18835.0,18835.0,18835.0,18835.0,18835.0,18835.0,18835.0
mean,52.991877,218211.6,0.258539,0.633348,0.644995,0.078008,5.289196,0.17965,-7.447435,0.628139,0.102099,121.073154,3.959119,0.527967
std,21.905654,59887.54,0.288719,0.156723,0.214101,0.221591,3.614595,0.143984,3.827831,0.483314,0.104378,28.714456,0.298533,0.244632
min,0.0,12000.0,1e-06,0.0,0.00107,0.0,0.0,0.0109,-38.768,0.0,0.0,0.0,0.0,0.0
25%,40.0,184339.5,0.0241,0.533,0.51,0.0,2.0,0.0929,-9.044,0.0,0.0378,98.368,4.0,0.335
50%,56.0,211306.0,0.132,0.645,0.674,1.1e-05,5.0,0.122,-6.555,1.0,0.0555,120.013,4.0,0.527
75%,69.0,242844.0,0.424,0.748,0.815,0.00257,8.0,0.221,-4.908,1.0,0.119,139.931,4.0,0.725
max,100.0,1799346.0,0.996,0.987,0.999,0.997,11.0,0.986,1.585,1.0,0.941,242.318,5.0,0.984
