In [1]:
import pandas as pd
df = pd.read_csv('train.csv')
df_val = pd.read_csv('dev.csv')
df = df.dropna()
missing_values = df.isnull().sum()
print(missing_values)
df = df[df['premise']!=df['hypothesis']]
df = df.drop_duplicates(subset=['premise', 'hypothesis'])
df.to_csv('cleaned_train.csv',index = False)
df_val = df_val.dropna()
missing_values_val = df_val.isnull().sum()
print(missing_values)
df_val = df_val[df_val['premise']!=df_val['hypothesis']]
df_val = df_val.drop_duplicates(subset=['premise', 'hypothesis'])
df_val = df_val.dropna()
df_val.to_csv('cleaned_val.csv',index = False)
# preprocessing
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import download

download('stopwords')
download('punkt')


stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    """对文本进行预处理"""

    text = re.sub(r'https?://\S+|www\.\S+', '', text)


    text = re.sub(r'[^\x00-\x7F]+', ' ', text).lower()


    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)


    tokens = word_tokenize(text)
    filtered_tokens = [w for w in tokens if not w in stop_words]


    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]


    return ' '.join(stemmed_tokens)


df = pd.read_csv('cleaned_train.csv')
df_val = pd.read_csv('cleaned_val.csv')

print(df.columns)
print(df_val.columns)


df['premise'] = df['premise'].apply(preprocess_text)
df['hypothesis'] = df['hypothesis'].apply(preprocess_text)
df_val['premise'] = df_val['premise'].apply(preprocess_text)
df_val['hypothesis'] = df_val['hypothesis'].apply(preprocess_text)


label_data_train = df["label"].tolist()
premise_data_train = df["premise"].tolist()
hypothesis_data_train = df["hypothesis"].tolist()

label_data_val = df_val["label"].tolist()
premise_data_val = df_val["premise"].tolist()
hypothesis_data_val = df_val["hypothesis"].tolist()


premise       0
hypothesis    0
label         0
dtype: int64
premise       0
hypothesis    0
label         0
dtype: int64


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\38673\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\38673\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Index(['premise', 'hypothesis', 'label'], dtype='object')
Index(['premise', 'hypothesis', 'label'], dtype='object')


In [2]:
import fasttext
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import nltk


nltk.download('wordnet')
nltk.download('omw-1.4')

def find_antonyms(word):
    antonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            if lemma.antonyms():
                antonyms.append(lemma.antonyms()[0].name())
    return set(antonyms)

def antonym_features(sentence1, sentence2):
    words1 = set(word_tokenize(sentence1))
    words2 = set(word_tokenize(sentence2))
    antonyms1 = {ant for word in words1 for ant in find_antonyms(word)}
    antonyms2 = {ant for word in words2 for ant in find_antonyms(word)}
    return len(antonyms1.intersection(antonyms2))


ft_model = fasttext.load_model('cc.en.300.bin')

def pair_sentence_vectors(sentence1, sentence2):
    sentence_vector1 = ft_model.get_sentence_vector(sentence1)
    sentence_vector2 = ft_model.get_sentence_vector(sentence2)


    cos_similarity = cosine_similarity([sentence_vector1], [sentence_vector2])[0, 0]
    abs_diff = np.abs(sentence_vector1 - sentence_vector2)
    elem_mul = sentence_vector1 * sentence_vector2
    len1 = len(word_tokenize(sentence1))
    len2 = len(word_tokenize(sentence2))


    antonym_match_count = antonym_features(sentence1, sentence2)


    combined_vector = np.concatenate((sentence_vector1, sentence_vector2, abs_diff, elem_mul, [cos_similarity, len1, len2, antonym_match_count]))
    
    return combined_vector


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\38673\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\38673\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler

# 假设 ft_model 已经加载且可用
def generate_features(premises, hypotheses):
    features = [pair_sentence_vectors(p, h) for p, h in zip(premises, hypotheses)]
    return np.array(features)

# 生成训练和验证数据的特征向量
X_train = generate_features(premise_data_train, hypothesis_data_train)
X_val = generate_features(premise_data_val, hypothesis_data_val)
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# 初始化SVM分类器
# svm_classifier = SVC(C=0.1,kernel='sigmoid')  # 你可以选择使用不同的核函数，如 'rbf'
svm_classifier = LinearSVC(C=1,loss = "squared_hinge",dual = False)
svm_classifier.fit(X_train, label_data_train)  # 训练模型

# 使用验证集评估模型
predictions = svm_classifier.predict(X_val)

# 输出分类报告
print(classification_report(label_data_val, predictions))


              precision    recall  f1-score   support

           0       0.62      0.58      0.60      3233
           1       0.63      0.67      0.65      3452

    accuracy                           0.63      6685
   macro avg       0.63      0.63      0.63      6685
weighted avg       0.63      0.63      0.63      6685



In [4]:
df_2 = pd.read_csv('test.csv')
premise_data_test = df_2["premise"].tolist()
hypothesis_data_test = df_2["hypothesis"].tolist()


In [5]:
df_new = pd.DataFrame()
X_test = generate_features(premise_data_test, hypothesis_data_test)

predicted_labels = svm_classifier.predict(X_test)

df_new['prediction'] = predicted_labels

df_new.to_csv('Group_36_A.csv', index=False)

In [6]:
!pip install joblib
from joblib import dump

dump(svm_classifier, 'svm_classifier.joblib')




['svm_classifier.joblib']

Demoes Goes Here 

In [21]:
# load the model 
from joblib import load
svm_classifier = load('svm_classifier.joblib')
sentence_1 = "thanks"
sentence_2 = "thank you"

#load fast text 
ft_model = fasttext.load_model('cc.en.300.bin')

def pair_sentence_vectors(sentence1, sentence2):
    sentence_vector1 = ft_model.get_sentence_vector(sentence1)
    sentence_vector2 = ft_model.get_sentence_vector(sentence2)
    cos_similarity = cosine_similarity([sentence_vector1], [sentence_vector2])[0, 0]
    abs_diff = np.abs(sentence_vector1 - sentence_vector2)
    elem_mul = sentence_vector1 * sentence_vector2
    len1 = len(word_tokenize(sentence1))
    len2 = len(word_tokenize(sentence2))


    antonym_match_count = antonym_features(sentence1, sentence2)

    combined_vector = np.concatenate((
        sentence_vector1, 
        sentence_vector2, 
        abs_diff, 
        elem_mul, 
        [cos_similarity, len1, len2, antonym_match_count]
    ))

    return combined_vector
    
def generate_feature(premise, hypothese):
    features = pair_sentence_vectors(premise,hypothese) 
    return np.array(features)

#predict 
Input_vectors = generate_feature(sentence_1,sentence_2)
prediction = svm_classifier.predict(Input_vectors.reshape(1, -1))



In [22]:
print(prediction)

[1]
