In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

nltk.download('stopwords')
nltk.download('punkt')

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    stop_words.add('br')
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)

def preprocess_text(data):
    data['text'] = data['text'].apply(remove_stopwords)#Remove stopwords
    data['text'] = data['text'].apply(lambda x: re.sub('https?:\/\/.*[\r\n]*', ' ', x))#Remove URLs
    data['text'] = data['text'].apply(lambda x: re.sub('[^a-zA-Z0-9 \n]', ' ', x))#Remove non-alphanumeric characters
    data['text'] = data['text'].apply(lambda x: re.sub('@[\w]*', '', x))#Remove Twitter usernames
    data['text'] = data['text'].apply(lambda x: re.sub('\d+', ' ', x))#Remove digits
    data['text'] = data['text'].apply(lambda x: re.sub('user', '', x))#Remove Twitter usernames
    data['text'] = data['text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>1]))#Remove Short Words
    data['text'] = data['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))#Remove punctuation
    data['text'] = data['text'].str.lower()
    return data

  data['text'] = data['text'].apply(lambda x: re.sub('https?:\/\/.*[\r\n]*', ' ', x))#Remove URLs
  data['text'] = data['text'].apply(lambda x: re.sub('@[\w]*', '', x))#Remove Twitter usernames
  data['text'] = data['text'].apply(lambda x: re.sub('\d+', ' ', x))#Remove digits
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

data=pd.read_csv('C:/Users/DELL/Downloads/twitter_E6oV3lV.csv')
data.columns=['id','label','text']
#print(data['label'].value_counts())
print("Class Distribution:")
print(data['label'].value_counts(normalize=True))

Class Distribution:
label
0    0.929854
1    0.070146
Name: proportion, dtype: float64


In [4]:
from sklearn.utils import resample

class_0 = data[data['label'] == 0]
class_1 = data[data['label'] == 1]

majority_class = 0 if len(class_0) > len(class_1) else 1
minority_class = 1 - majority_class

#Downsamle
class_majority_downsampled = resample(class_0 if majority_class == 0 else class_1,
                                      replace=False,
                                      n_samples=len(class_1),
                                      random_state=42)
#Merge balanced data
balanced_data = pd.concat([class_majority_downsampled, class_1] if majority_class == 0 else [class_0, class_majority_downsampled])
#print(balanced_data['label'].value_counts())
print("Class Distribution:")
print(balanced_data['label'].value_counts(normalize=True))

Class Distribution:
label
0    0.5
1    0.5
Name: proportion, dtype: float64


In [6]:
from sklearn.model_selection import train_test_split
#Segment dataset
train_data, test_data = train_test_split(balanced_data, test_size=0.2, random_state=42)
train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)
train_data = pd.read_csv('C:/Users/DELL/Downloads/train_data.csv')
test_data = pd.read_csv('C:/Users/DELL/Downloads/test_data.csv')

train_data.columns=['id','label','text']
test_data.columns=['id','label','text']

train_data.head()

Unnamed: 0,id,label,text
0,5280,0,sad world... #orlando #tuerie #terrorism #usa
1,15986,1,@user i know. ref. to #malevote &amp; #womenvo...
2,6018,0,priority tou choro koi 2nd option tk ni banata...
3,27315,0,@user arr look at taylor on the slide!! ð ...
4,3139,1,@user you might be a libtard if... #libtard #...


In [7]:
import pandas as pd
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import torch




# 加载数据
train_data = preprocess_text(train_data)
test_data = preprocess_text(test_data)

# 初始化tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# 数据预处理函数
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

# 创建datasets
train_dataset = Dataset.from_pandas(train_data).map(preprocess_function, batched=True)
test_dataset = Dataset.from_pandas(test_data).map(preprocess_function, batched=True)

# 加载预训练模型
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# 指定设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)  # 将模型转移到指定的设备

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 3587/3587 [00:00<00:00, 3744.63 examples/s]
Map: 100%|██████████| 897/897 [00:00<00:00, 3143.14 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:


# 设置训练参数
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
)

# 初始化Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [9]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from nltk.tokenize import word_tokenize

In [10]:
from sklearn.utils import resample
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [13]:
import torch
from datasets import Dataset, load_metric
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)

In [14]:
trainer.train()

  f1 = load_metric("f1")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
                        
129it [25:55, 12.05s/it]                          


{'eval_loss': 0.4201464056968689, 'eval_accuracy': 0.8472686733556298, 'eval_f1': 0.8562434417628542, 'eval_precision': 0.8176352705410822, 'eval_recall': 0.8986784140969163, 'eval_runtime': 114.3196, 'eval_samples_per_second': 7.846, 'eval_steps_per_second': 0.988, 'epoch': 1.0}



[A                                               

{'loss': 0.4729, 'grad_norm': 4.8478779792785645, 'learning_rate': 5e-05, 'epoch': 1.11}


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
                                                 
                                                  

{'eval_loss': 0.34462088346481323, 'eval_accuracy': 0.8684503901895206, 'eval_f1': 0.865909090909091, 'eval_precision': 0.8943661971830986, 'eval_recall': 0.8392070484581498, 'eval_runtime': 115.4251, 'eval_samples_per_second': 7.771, 'eval_steps_per_second': 0.979, 'epoch': 2.0}



[A                                                

{'loss': 0.4077, 'grad_norm': 9.838132858276367, 'learning_rate': 2.0484061393152303e-05, 'epoch': 2.23}


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.

                                                     
100%|██████████| 1347/1347 [1:16:49<00:00,  3.42s/it]

{'eval_loss': 0.43296700716018677, 'eval_accuracy': 0.8795986622073578, 'eval_f1': 0.88, 'eval_precision': 0.8878923766816144, 'eval_recall': 0.8722466960352423, 'eval_runtime': 121.3237, 'eval_samples_per_second': 7.393, 'eval_steps_per_second': 0.931, 'epoch': 3.0}
{'train_runtime': 4609.618, 'train_samples_per_second': 2.334, 'train_steps_per_second': 0.292, 'train_loss': 0.3987255082275572, 'epoch': 3.0}





TrainOutput(global_step=1347, training_loss=0.3987255082275572, metrics={'train_runtime': 4609.618, 'train_samples_per_second': 2.334, 'train_steps_per_second': 0.292, 'train_loss': 0.3987255082275572, 'epoch': 3.0})

In [15]:
import torch
from datasets import Dataset, load_metric
import numpy as np


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    f1 = load_metric("f1")
    precision = load_metric("precision")
    recall = load_metric("recall")
    accuracy = load_metric("accuracy")

    f1_score = f1.compute(predictions=predictions, references=labels, average='binary')
    precision_score = precision.compute(predictions=predictions, references=labels, average='binary')
    recall_score = recall.compute(predictions=predictions, references=labels, average='binary')
    accuracy_score = accuracy.compute(predictions=predictions, references=labels)

    return {
        "accuracy": accuracy_score['accuracy'],
        "f1": f1_score['f1'],
        "precision": precision_score['precision'],
        "recall": recall_score['recall']
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.evaluate()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|██████████| 113/113 [00:45<00:00,  2.47it/s]


{'eval_loss': 0.43296700716018677,
 'eval_accuracy': 0.8795986622073578,
 'eval_f1': 0.88,
 'eval_precision': 0.8878923766816144,
 'eval_recall': 0.8722466960352423,
 'eval_runtime': 46.1899,
 'eval_samples_per_second': 19.42,
 'eval_steps_per_second': 2.446}

In [16]:
model_path = "C:/Users/DELL/Downloads/modeltw"  # 请替换为你希望保存模型的实际路径
trainer.model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


('C:/Users/DELL/Downloads/modeltw\\tokenizer_config.json',
 'C:/Users/DELL/Downloads/modeltw\\special_tokens_map.json',
 'C:/Users/DELL/Downloads/modeltw\\vocab.json',
 'C:/Users/DELL/Downloads/modeltw\\merges.txt',
 'C:/Users/DELL/Downloads/modeltw\\added_tokens.json')