# 写入日志操作

In [1]:
import datetime

class Logger:
    """Minimal append-only logger that timestamps messages and writes them to a text file.

    The file is opened in append mode, written, and closed again on every call.
    """

    def __init__(self, log_file="operation_log.txt"):
        """Remember the path of the log file.

        Args:
            log_file: Path of the file that entries are appended to.
        """
        self.log_file = log_file

    def _append(self, line):
        """Append ``line`` followed by a newline to the log file (UTF-8)."""
        with open(self.log_file, "a", encoding="utf-8") as f:
            f.write(line + "\n")

    def log(self, message, console=True):
        """Write a timestamped entry to the log file.

        Args:
            message: Anything that can be formatted into an f-string.
            console: When true, the entry is also printed to stdout.
        """
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log_message = f"[{timestamp}] {message}"

        # Echo to the console
        if console:
            print(log_message)

        # Persist to the file
        self._append(log_message)

    def log_separator(self, char="=", width=50):
        """Append a separator line to the log file (not printed to the console).

        Args:
            char: String repeated to build the separator.
            width: Number of repetitions; the defaults give 50 '=' characters.
        """
        self._append(char * width)

In [4]:
# Shared logger for this notebook; entries are appended to StudyLog.txt.
logger = Logger(log_file="StudyLog.txt")

# 统计空缺值

In [13]:
import  pandas as pd
# Load the raw CSV (comma is read_csv's default separator) and preview the first 10 rows.
df = pd.read_csv("./DataSet/session_04_data.csv")
df.head(10)

Unnamed: 0.1,Unnamed: 0,ID,Name,gender,Age,Salary ($),Hire Date,Country,Weight
0,0,1,Alice,F,25,50000.0,2023-01-10,USA,70kg
1,1,2,Bob,M,Thirty,60000.0,2022-12-05,U.S.A.,154lbs
2,2,3,CHARLIE,M,40,,01/15/2021,UK,65kg
3,3,4,Dave,M,22,70000.0,"March 1, 2020",United Kingdom,140lbs
4,4,5,Eve,F,,80000.0,2021-07-20,canada,75kg
5,5,6,Frank,M,35,120000.0,08/10/2019,Canada,165lbs
6,6,7,George,M,29,,2020-05-30,Germany,80kg
7,7,8,Hannah,F,33,150000.0,"July 4, 2018",France,180lbs
8,8,9,Isaac,M,45,200000.0,06-01-2017,France,90kg
9,9,10,Jack,M,Twenty,250000.0,2016-09-15,Brazil,200lbs


In [14]:
# axis=1 drops a column; inplace=True modifies df directly.
# errors='ignore' keeps the cell re-runnable: without it a second run raises
# KeyError because the column has already been removed.
df.drop('Unnamed: 0', axis=1, inplace=True, errors='ignore')
logger.log_separator()
# Plain string literal: the message has no placeholders, so no f-prefix is needed.
logger.log("参数解释：axis=1表示删除列，inplace=True表示直接修改DataFrame")

[2026-01-25 11:32:20] 参数解释：axis=1表示删除列，inplace=True表示直接修改DataFrame


In [15]:
# Preview the first 10 rows of df after the column drop.
df.head(10)

Unnamed: 0,ID,Name,gender,Age,Salary ($),Hire Date,Country,Weight
0,1,Alice,F,25,50000.0,2023-01-10,USA,70kg
1,2,Bob,M,Thirty,60000.0,2022-12-05,U.S.A.,154lbs
2,3,CHARLIE,M,40,,01/15/2021,UK,65kg
3,4,Dave,M,22,70000.0,"March 1, 2020",United Kingdom,140lbs
4,5,Eve,F,,80000.0,2021-07-20,canada,75kg
5,6,Frank,M,35,120000.0,08/10/2019,Canada,165lbs
6,7,George,M,29,,2020-05-30,Germany,80kg
7,8,Hannah,F,33,150000.0,"July 4, 2018",France,180lbs
8,9,Isaac,M,45,200000.0,06-01-2017,France,90kg
9,10,Jack,M,Twenty,250000.0,2016-09-15,Brazil,200lbs


In [17]:
# Collect the column names once and log them. A bare expression that is not the
# last line of a cell is not displayed, so the list is only emitted via the logger.
column_names = df.columns.tolist()
logger.log(column_names)

[2026-01-25 11:35:56] ['ID', 'Name', 'gender', 'Age', 'Salary ($)', 'Hire Date', 'Country', 'Weight']


# 字段重命名

In [23]:
# Map the original English column names to their Chinese names and rename in place.
column_name_map = {
    'ID': '编号',
    'Name': '姓名',
    'gender': '性别',
    'Age': '年龄',
    'Salary ($)': '薪资',
    'Hire Date': '入职时间',
    'Country': '国家',
    'Weight': '体重',
}
df.rename(columns=column_name_map, inplace=True)

In [25]:
# Display the name column to inspect its raw values.
df["姓名"]

0       Alice
1       Bob  
2     CHARLIE
3        Dave
4         Eve
5       Frank
6      George
7      Hannah
8       Isaac
9        Jack
10        Eve
11      Frank
Name: 姓名, dtype: object

In [26]:
# Print the column summary: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   编号      12 non-null     int64  
 1   姓名      12 non-null     object 
 2   性别      12 non-null     object 
 3   年龄      10 non-null     object 
 4   薪资      10 non-null     float64
 5   入职时间    12 non-null     object 
 6   国家      12 non-null     object 
 7   体重      12 non-null     object 
dtypes: float64(1), int64(1), object(6)
memory usage: 896.0+ bytes


In [29]:
# Inspect the dtype of every column.
df.dtypes


编号        int64
姓名       object
性别       object
年龄       object
薪资      float64
入职时间     object
国家       object
体重       object
dtype: object

In [31]:
# Count missing values per column once, then log the result.
# (A bare expression that is not the last line of a cell is never displayed.)
null_counts = df.isnull().sum()
logger.log(f"{null_counts}")

[2026-01-25 12:05:34] 编号      0
姓名      0
性别      0
年龄      2
薪资      2
入职时间    0
国家      0
体重      0
dtype: int64


In [32]:
# Count the rows flagged as duplicates by df.duplicated().
df.duplicated().sum()

2

In [35]:
# Show the duplicated rows before removing them. A bare expression in the middle
# of a cell is not displayed, so the rows are printed explicitly.
duplicate_rows = df[df.duplicated()]
print(duplicate_rows)
df.drop_duplicates(inplace=True)
df

Unnamed: 0,编号,姓名,性别,年龄,薪资,入职时间,国家,体重
0,1,Alice,F,25,50000.0,2023-01-10,USA,70kg
1,2,Bob,M,Thirty,60000.0,2022-12-05,U.S.A.,154lbs
2,3,CHARLIE,M,40,,01/15/2021,UK,65kg
3,4,Dave,M,22,70000.0,"March 1, 2020",United Kingdom,140lbs
4,5,Eve,F,,80000.0,2021-07-20,canada,75kg
5,6,Frank,M,35,120000.0,08/10/2019,Canada,165lbs
6,7,George,M,29,,2020-05-30,Germany,80kg
7,8,Hannah,F,33,150000.0,"July 4, 2018",France,180lbs
8,9,Isaac,M,45,200000.0,06-01-2017,France,90kg
9,10,Jack,M,Twenty,250000.0,2016-09-15,Brazil,200lbs


In [39]:
# Fix data types, starting with the age column: list its distinct raw values.
# (The previous bare `df.dtypes` mid-cell produced no output and was removed.)
print(df['年龄'].unique())

['25' 'Thirty' '40' '22' nan '35' '29' '33' '45' 'Twenty']


In [40]:
# Replace the spelled-out ages with their numeric values.
spelled_out_ages = {'Thirty': 30, 'Twenty': 20}
df['年龄'] = df['年龄'].replace(spelled_out_ages)

In [41]:
# Re-check the distinct age values after the replacement.
print(df['年龄'].unique())

['25' 30 '40' '22' nan '35' '29' '33' '45' 20]


In [44]:
# Convert age to a numeric dtype; errors='coerce' turns values that cannot be
# parsed as numbers into NaN.
age_numeric = pd.to_numeric(df['年龄'], errors='coerce')
df['年龄'] = age_numeric
df.info()
df['年龄'].unique()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   编号      10 non-null     int64  
 1   姓名      10 non-null     object 
 2   性别      10 non-null     object 
 3   年龄      9 non-null      float64
 4   薪资      8 non-null      float64
 5   入职时间    10 non-null     object 
 6   国家      10 non-null     object 
 7   体重      10 non-null     object 
dtypes: float64(2), int64(1), object(5)
memory usage: 720.0+ bytes


array([25., 30., 40., 22., nan, 35., 29., 33., 45., 20.])

In [51]:
# Convert the hire-date strings, which mix several formats, to datetime64.
# Each string is parsed on its own with dateutil so the result does not depend on
# pandas inferring one format from the first row: pandas >= 2.0 does that and,
# combined with errors='coerce', silently turns rows in other formats into NaT.
from dateutil import parser as _dateutil_parser


def _parse_hire_date(value):
    """Parse one hire-date cell; non-strings pass through, unparseable text becomes NaT."""
    if not isinstance(value, str):
        return value
    try:
        return _dateutil_parser.parse(value)
    except (ValueError, OverflowError):
        return pd.NaT


df["入职时间"] = pd.to_datetime(df["入职时间"].map(_parse_hire_date), errors='coerce')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   编号      10 non-null     int64         
 1   姓名      10 non-null     object        
 2   性别      10 non-null     object        
 3   年龄      9 non-null      float64       
 4   薪资      8 non-null      float64       
 5   入职时间    10 non-null     datetime64[ns]
 6   国家      10 non-null     object        
 7   体重      10 non-null     object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 720.0+ bytes


In [52]:
from dateutil import parser as ps
def date_parser(date):
    """Parse a single date value with dateutil.

    Args:
        date: A date string, a missing value, or an already-parsed date object.

    Returns:
        The parsed ``datetime`` for strings; date/datetime objects (including
        pandas Timestamps) and missing values are returned unchanged.
    """
    # Already a date object: ps.parse() would raise TypeError on it, so pass it through.
    if isinstance(date, datetime.date):
        return date
    if pd.notna(date):
        return ps.parse(date)
    return date

<module 'dateutil.parser' from 'D:\\ProgramData\\Anaconda3\\lib\\site-packages\\dateutil\\parser\\__init__.py'>


In [58]:
# Store gender as a pandas categorical dtype.
# (The two bare expressions that used to precede this line were never displayed
# because they were not the last line of the cell, so they were removed.)
df['性别'] = df['性别'].astype('category')
df.info()
df['性别'].unique()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   编号      10 non-null     int64         
 1   姓名      10 non-null     object        
 2   性别      10 non-null     category      
 3   年龄      9 non-null      float64       
 4   薪资      8 non-null      float64       
 5   入职时间    10 non-null     datetime64[ns]
 6   国家      10 non-null     object        
 7   体重      10 non-null     object        
dtypes: category(1), datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 746.0+ bytes


[F, M]
Categories (2, object): [F, M]

In [62]:
# Trim surrounding whitespace from every name, then upper-case it.
cleaned_names = df['姓名'].str.strip()
df['姓名'] = cleaned_names.str.upper()
df

Unnamed: 0,编号,姓名,性别,年龄,薪资,入职时间,国家,体重
0,1,ALICE,F,25.0,50000.0,2023-01-10,USA,70kg
1,2,BOB,M,30.0,60000.0,2022-12-05,U.S.A.,154lbs
2,3,CHARLIE,M,40.0,,2021-01-15,UK,65kg
3,4,DAVE,M,22.0,70000.0,2020-03-01,United Kingdom,140lbs
4,5,EVE,F,,80000.0,2021-07-20,canada,75kg
5,6,FRANK,M,35.0,120000.0,2019-08-10,Canada,165lbs
6,7,GEORGE,M,29.0,,2020-05-30,Germany,80kg
7,8,HANNAH,F,33.0,150000.0,2018-07-04,France,180lbs
8,9,ISAAC,M,45.0,200000.0,2017-06-01,France,90kg
9,10,JACK,M,20.0,250000.0,2016-09-15,Brazil,200lbs


In [64]:
# List the distinct raw country values to see which spellings need normalising.
df.国家.unique()

array(['USA', 'U.S.A.', 'UK', 'United Kingdom', 'canada', 'Canada',
       'Germany', 'France', 'Brazil'], dtype=object)

In [71]:
# English country name -> Chinese country name.
countries_dic = {
    'USA': '美国',
    'UK': '英国',
    'U.S.A.': '美国',
    'canada': '加拿大',
    'Japan': '日本',
    'China': '中国',
    'Germany': '德国',
    'France': '法国',
    'Italy': '意大利',
    'Australia': '澳大利亚',
    'South Korea': '韩国',
    'New Zealand': '新西兰',
    'United Kingdom': '英国',
    'Canada': '加拿大',
    "Brazil": "巴西"
}

# Case-insensitive view of the dictionary so spellings such as 'CANADA' or
# ' canada ' are translated as well instead of slipping past an exact match.
_country_lookup = {key.casefold(): value for key, value in countries_dic.items()}


def _translate_country(name):
    """Translate one country cell; unknown names are returned stripped and title-cased."""
    if not isinstance(name, str):
        # Missing values pass through untouched.
        return name
    cleaned = name.strip()
    # Normalise before the lookup; title-casing is only the fallback for unknown names.
    return _country_lookup.get(cleaned.casefold(), cleaned.title())


df['国家'] = df['国家'].map(_translate_country)
df

Unnamed: 0,编号,姓名,性别,年龄,薪资,入职时间,国家,体重
0,1,ALICE,F,25.0,50000.0,2023-01-10,美国,70kg
1,2,BOB,M,30.0,60000.0,2022-12-05,美国,154lbs
2,3,CHARLIE,M,40.0,,2021-01-15,英国,65kg
3,4,DAVE,M,22.0,70000.0,2020-03-01,英国,140lbs
4,5,EVE,F,,80000.0,2021-07-20,加拿大,75kg
5,6,FRANK,M,35.0,120000.0,2019-08-10,加拿大,165lbs
6,7,GEORGE,M,29.0,,2020-05-30,德国,80kg
7,8,HANNAH,F,33.0,150000.0,2018-07-04,法国,180lbs
8,9,ISAAC,M,45.0,200000.0,2017-06-01,法国,90kg
9,10,JACK,M,20.0,250000.0,2016-09-15,巴西,200lbs


In [73]:
LB_TO_KG = 0.453592  # kilograms per pound


def convert_weight(item):
    """Convert a weight string such as '70kg' or '154lbs' to kilograms.

    Args:
        item: A weight string with a 'kg', 'lbs' or 'lb' suffix (case and
            surrounding whitespace are ignored), or any non-string value.

    Returns:
        The weight in kilograms as a float. Non-string values (already
        converted numbers, missing values) are returned unchanged, and strings
        without a recognised unit yield NaN.

    Raises:
        ValueError: If the part before the unit is not a valid number.
    """
    if not isinstance(item, str):  # already numeric or missing: nothing to convert
        return item
    text = item.strip().lower()
    if text.endswith('kg'):
        return float(text[:-2])
    # Check 'lbs' before 'lb' so the whole suffix is removed.
    if text.endswith('lbs'):
        return float(text[:-3]) * LB_TO_KG
    if text.endswith('lb'):
        return float(text[:-2]) * LB_TO_KG
    # Unknown unit: report it as missing explicitly rather than returning None implicitly.
    return float('nan')
# Run convert_weight over every value of the weight column and store the result back.
weights_kg = df['体重'].apply(convert_weight)
df['体重'] = weights_kg
df

Unnamed: 0,编号,姓名,性别,年龄,薪资,入职时间,国家,体重
0,1,ALICE,F,25.0,50000.0,2023-01-10,美国,70.0
1,2,BOB,M,30.0,60000.0,2022-12-05,美国,69.853168
2,3,CHARLIE,M,40.0,,2021-01-15,英国,65.0
3,4,DAVE,M,22.0,70000.0,2020-03-01,英国,63.50288
4,5,EVE,F,,80000.0,2021-07-20,加拿大,75.0
5,6,FRANK,M,35.0,120000.0,2019-08-10,加拿大,74.84268
6,7,GEORGE,M,29.0,,2020-05-30,德国,80.0
7,8,HANNAH,F,33.0,150000.0,2018-07-04,法国,81.64656
8,9,ISAAC,M,45.0,200000.0,2017-06-01,法国,90.0
9,10,JACK,M,20.0,250000.0,2016-09-15,巴西,90.7184


# 删除空缺值

# 填充空缺值

In [86]:
# Mean of the salary column (pandas skips NaN by default); used by the next cell
# to fill the missing salaries.
# NOTE(review): this cell's execution count (86) is higher than the cells that
# follow it (77, 80), so it was last run out of order — confirm with Restart & Run All.
mean_salary = df['薪资'].mean()
mean_salary

122500.0

In [77]:
# Fill missing salaries with the mean. Assign the result back to the column:
# fillna(..., inplace=True) on a column accessor is chained assignment, which is
# deprecated and does not update df when pandas Copy-on-Write is active.
df['薪资'] = df['薪资'].fillna(mean_salary)
df

Unnamed: 0,编号,姓名,性别,年龄,薪资,入职时间,国家,体重
0,1,ALICE,F,25.0,50000.0,2023-01-10,美国,70.0
1,2,BOB,M,30.0,60000.0,2022-12-05,美国,69.853168
2,3,CHARLIE,M,40.0,122500.0,2021-01-15,英国,65.0
3,4,DAVE,M,22.0,70000.0,2020-03-01,英国,63.50288
4,5,EVE,F,,80000.0,2021-07-20,加拿大,75.0
5,6,FRANK,M,35.0,120000.0,2019-08-10,加拿大,74.84268
6,7,GEORGE,M,29.0,122500.0,2020-05-30,德国,80.0
7,8,HANNAH,F,33.0,150000.0,2018-07-04,法国,81.64656
8,9,ISAAC,M,45.0,200000.0,2017-06-01,法国,90.0
9,10,JACK,M,20.0,250000.0,2016-09-15,巴西,90.7184


In [80]:
# Fill missing ages with the median. Assign the result back to the column:
# fillna(..., inplace=True) on a column accessor is chained assignment, which is
# deprecated and does not update df when pandas Copy-on-Write is active.
medAge = df['年龄'].median()
df['年龄'] = df['年龄'].fillna(medAge)
df

Unnamed: 0,编号,姓名,性别,年龄,薪资,入职时间,国家,体重
0,1,ALICE,F,25.0,50000.0,2023-01-10,美国,70.0
1,2,BOB,M,30.0,60000.0,2022-12-05,美国,69.853168
2,3,CHARLIE,M,40.0,122500.0,2021-01-15,英国,65.0
3,4,DAVE,M,22.0,70000.0,2020-03-01,英国,63.50288
4,5,EVE,F,30.0,80000.0,2021-07-20,加拿大,75.0
5,6,FRANK,M,35.0,120000.0,2019-08-10,加拿大,74.84268
6,7,GEORGE,M,29.0,122500.0,2020-05-30,德国,80.0
7,8,HANNAH,F,33.0,150000.0,2018-07-04,法国,81.64656
8,9,ISAAC,M,45.0,200000.0,2017-06-01,法国,90.0
9,10,JACK,M,20.0,250000.0,2016-09-15,巴西,90.7184


# 使用预测方法进行填充空缺值