# 数据导入

In [1]:
import pandas as pd
import nltk
import re

In [2]:
data_path = '/Volumes/时伟SSD512G/China-Central_Asia_Summit.csv'
df = pd.read_csv(data_path, encoding='utf8')

In [3]:
df.head()

Unnamed: 0,author_username,in_reply_to_username,retweeted_username,quoted_username,date,time,text,public_metrics.impression_count,public_metrics.reply_count,public_metrics.retweet_count,public_metrics.quote_count,public_metrics.like_count,author.created_at,author.public_metrics.followers_count,author.public_metrics.following_count,author.public_metrics.tweet_count,author.verified
0,thehousejapan,,yasminalombaert,,2023/5/20,23:59:00.000Z,A real bad weekend for pariah Putin.\n\nThe Sa...,78144,110,468,31,2183,2016-03-11T05:51:16.000Z,68,1110,18221,False
1,jamilsiddiq3,,ChinaDaily,,2023/5/20,23:58:45.000Z,Chinese State Councilor and Foreign Minister Q...,9288,12,40,2,133,2014-04-23T17:13:59.000Z,1347,4985,54243,False
2,MyronGainez_,,Gerashchenko_en,,2023/5/20,23:58:24.000Z,Congratulations to the great geopolitical stra...,1504896,583,2853,226,15819,2021-10-06T06:44:53.000Z,284,135,5821,False
3,dragonfire55555,,yasminalombaert,,2023/5/20,23:58:03.000Z,A real bad weekend for pariah Putin.\n\nThe Sa...,78144,110,468,31,2183,2011-01-27T02:52:30.000Z,822,261,239212,False
4,SokoVera,,Gerashchenko_en,,2023/5/20,23:57:32.000Z,Congratulations to the great geopolitical stra...,1504896,583,2853,226,15819,2020-01-22T20:08:09.000Z,12,108,2867,False


# 数据非常多，先提取需要的列

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16350 entries, 0 to 16349
Data columns (total 17 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   author_username                        16350 non-null  object
 1   in_reply_to_username                   173 non-null    object
 2   retweeted_username                     13437 non-null  object
 3   quoted_username                        328 non-null    object
 4   date                                   16350 non-null  object
 5   time                                   16350 non-null  object
 6   text                                   16350 non-null  object
 7   public_metrics.impression_count        16350 non-null  int64 
 8   public_metrics.reply_count             16350 non-null  int64 
 9   public_metrics.retweet_count           16350 non-null  int64 
 10  public_metrics.quote_count             16350 non-null  int64 
 11  public_metrics.

# 可以开始数据预处理了

In [5]:
df["text"] = df["text"].str.lower()  # 统一为小写字母

In [6]:
# 导入nltk的分词工具word_tokenize和nltk的停用词表stopwords，需要自行下载哦
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stopwords_list = stopwords.words("english")

In [7]:
# 定义一个函数，处理Twitter文本中的特殊符号：@xxx、&xxx、网址、多余的空格，最后只保留数字和字母
def clean_specialtext(tweet):
    tweet = re.sub(r"@([^\s:]+)", " ", tweet)  # 去除@提及内容
    tweet = re.sub(r"&([^\s:]+)", " ", tweet)  # 去除&及之后的内容
    tweet = re.sub(r"[a-zA-z]+://[^\s]*", " ", tweet)  # 去除网址
    tweet = re.sub(r"[^0-9A-Za-z]", " ", tweet)  # 仅保留数字及字母
    tweet = re.sub(r"(^\s*)|(\s*$)", "", tweet)  # 去除字符串前后空格
    # tweet = re.sub(r'[^0-9A-Za-z]', ' ', tweet)
    return tweet

In [8]:
df["text"] = df["text"].apply(lambda x: clean_specialtext(x))

In [9]:
#导入nltk的词性标注工具pos_tag、词性属性工具wordnet和词形还原工具WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()

In [10]:
# 这一步是为了更精确词形还原，对词性进行标注，精确的词性对词形还原很有必要
# 参考https://stackoverflow.com/questions/61982023/using-wordnetlemmatizer-lemmatize-with-pos-tags-throws-keyerror
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith("J"):
        return wordnet.ADJ
    elif treebank_tag.startswith("V"):
        return wordnet.VERB
    elif treebank_tag.startswith("N"):
        return wordnet.NOUN
    elif treebank_tag.startswith("R"):
        return wordnet.ADV
    else:
        return None

In [12]:
# 定义去除停用词和词形还原函数，每一步都是有顺序的，不能乱
def clean_stopwords_Lemmatization(tweet):
    tweet = word_tokenize(tweet)
    # print(tweet)
    # print("*"*100)
    tweet = [word for word in tweet if word not in stopwords_list]  # 去除停用词
    tweet = pos_tag(tweet)  # 词性标注
    # 词性标注好之后，列表里变成了（词，词性）的元组形式，word[0]是词的位置，对word[0]进行词形还原，依据是上面的函数和word[1]
    tweet = [
        lemmatizer.lemmatize(word[0], pos=get_wordnet_pos(word[1]))
        if get_wordnet_pos(word[1])
        else word[0]
        for word in tweet
    ]
    tweet = [word for word in tweet if len(word) > 2]  # 丢弃长度小于等于2的字符串
    tweet = " ".join(tweet)  # 将tweet列表中的字符串重新拼接起来，用空格分隔
    # print(tweet)
    # print("#"*100)
    return tweet

In [13]:
df['text'] = df['text'].apply(lambda x: clean_stopwords_Lemmatization(x))

In [14]:
df.head()

Unnamed: 0,author_username,in_reply_to_username,retweeted_username,quoted_username,date,time,text,public_metrics.impression_count,public_metrics.reply_count,public_metrics.retweet_count,public_metrics.quote_count,public_metrics.like_count,author.created_at,author.public_metrics.followers_count,author.public_metrics.following_count,author.public_metrics.tweet_count,author.verified
0,thehousejapan,,yasminalombaert,,2023/5/20,23:59:00.000Z,real bad weekend pariah putin nthe saudi invit...,78144,110,468,31,2183,2016-03-11T05:51:16.000Z,68,1110,18221,False
1,jamilsiddiq3,,ChinaDaily,,2023/5/20,23:58:45.000Z,chinese state councilor foreign minister qin g...,9288,12,40,2,133,2014-04-23T17:13:59.000Z,1347,4985,54243,False
2,MyronGainez_,,Gerashchenko_en,,2023/5/20,23:58:24.000Z,congratulation great geopolitical strategist p...,1504896,583,2853,226,15819,2021-10-06T06:44:53.000Z,284,135,5821,False
3,dragonfire55555,,yasminalombaert,,2023/5/20,23:58:03.000Z,real bad weekend pariah putin nthe saudi invit...,78144,110,468,31,2183,2011-01-27T02:52:30.000Z,822,261,239212,False
4,SokoVera,,Gerashchenko_en,,2023/5/20,23:57:32.000Z,congratulation great geopolitical strategist p...,1504896,583,2853,226,15819,2020-01-22T20:08:09.000Z,12,108,2867,False


In [23]:
df.to_csv('/Volumes/时伟SSD512G/CCAS清洗完成.csv', encoding='utf8')