In [26]:
import os
# 解决OpenMP库冲突问题
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

In [27]:
column_names = ['tweet_id', 'entity', 'sentiment', 'tweet_content']
train_data = pd.read_csv('data/twitter_training.csv', 
                        names=column_names,  
                        encoding='utf-8')    

validation_data = pd.read_csv('data/twitter_validation.csv', 
                             names=column_names,
                             encoding='utf-8')

In [28]:
# 检查缺失值
print("缺失值统计:")
print("训练集:")
print(train_data.isnull().sum())
print("\n验证集:")
print(validation_data.isnull().sum())

# 检查重复值
print(f"\n重复行数量:")
print(f"训练集: {train_data.duplicated().sum()}")
print(f"验证集: {validation_data.duplicated().sum()}")

# 查看情感标签分布
print(f"\n情感标签分布:")
print("训练集:")
print(train_data['sentiment'].value_counts())
print("\n验证集:")
print(validation_data['sentiment'].value_counts())

缺失值统计:
训练集:
tweet_id           0
entity             0
sentiment          0
tweet_content    686
dtype: int64

验证集:
tweet_id         0
entity           0
sentiment        0
tweet_content    0
dtype: int64

重复行数量:
训练集: 2700
验证集: 0

情感标签分布:
训练集:
sentiment
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

验证集:
sentiment
Neutral       285
Positive      277
Negative      266
Irrelevant    172
Name: count, dtype: int64


In [29]:
import re
import string
from collections import Counter

def clean_text(text):
    """
    文本清理函数
    - 转换为小写
    - 移除URL、用户名、特殊字符
    - 移除多余空格
    """
    if pd.isna(text):
        return ""
    
    # 转换为小写
    text = text.lower()
    
    # 移除URL
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # 移除用户名 (@username)
    text = re.sub(r'@\w+', '', text)
    
    # 移除标点符号（保留空格）
    text = re.sub(r'[^\w\s]', '', text)
    
    # 移除多余空格
    text = ' '.join(text.split())
    
    return text

train_data['cleaned_tweet'] = train_data['tweet_content'].apply(clean_text)
validation_data['cleaned_tweet'] = validation_data['tweet_content'].apply(clean_text)

# 查看清理前后的对比
for i in range(3):
    print(f"\n原文 {i+1}: {train_data['tweet_content'].iloc[-i]}")
    print(f"清理后: {train_data['cleaned_tweet'].iloc[-i]}")

# 统计词汇
all_words = []
for tweet in train_data['cleaned_tweet']:
    if tweet: 
        all_words.extend(tweet.split())

word_freq = Counter(all_words)
print(f"总词汇数: {len(all_words)}")
print(f"唯一词汇数: {len(word_freq)}")
print(f"前10个高频词: {word_freq.most_common(10)}")


原文 1: im getting on borderlands and i will murder you all ,
清理后: im getting on borderlands and i will murder you all

原文 2: Just like the windows partition of my Mac is like 6 years behind on its drivers So you have no idea how I didn’t notice
清理后: just like the windows partition of my mac is like 6 years behind on its drivers so you have no idea how i didnt notice

原文 3: Just realized between the windows partition of my Mac is like being 6 years behind on Nvidia drivers and cars I have no fucking idea how I ever didn ’ t notice
清理后: just realized between the windows partition of my mac is like being 6 years behind on nvidia drivers and cars i have no fucking idea how i ever didn t notice
总词汇数: 1342695
唯一词汇数: 39744
前10个高频词: [('the', 44445), ('i', 29191), ('to', 28830), ('and', 26591), ('a', 24127), ('of', 19444), ('is', 17824), ('for', 15611), ('in', 15399), ('this', 14666)]


In [30]:
# 创建最终的处理后数据集
processed_train = train_data[['tweet_id', 'entity', 'sentiment', 'cleaned_tweet']].copy()
processed_validation = validation_data[['tweet_id', 'entity', 'sentiment', 'cleaned_tweet']].copy()

# 移除空的推文
processed_train = processed_train[processed_train['cleaned_tweet'].str.len() > 0]
processed_validation = processed_validation[processed_validation['cleaned_tweet'].str.len() > 0]

print(f"  - 训练集: {len(processed_train)} 条数据")
print(f"  - 验证集: {len(processed_validation)} 条数据")

# 保存处理后的数据
# processed_train.to_csv('data/processed_train.csv', index=False)
# processed_validation.to_csv('data/processed_validation.csv', index=False)


  - 训练集: 73681 条数据
  - 验证集: 1000 条数据
