In [3]:
# Import necessary libraries
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pickle
from nltk.corpus import stopwords

In [4]:
# Download stopwords if not already downloaded
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mktmi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Load dataset
df = pd.read_csv('training_data_lowercase.csv', sep='\t', names=['label', 'text'])
# Print dataset
df


Unnamed: 0,label,text
0,0,donald trump sends out embarrassing new year‚s...
1,0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...
...,...,...
34147,1,tears in rain as thais gather for late king's ...
34148,1,pyongyang university needs non-u.s. teachers a...
34149,1,philippine president duterte to visit japan ah...
34150,1,japan's abe may have won election\tbut many do...


In [6]:
# Check and clean dataset
df.dropna(subset=['text', 'label'], inplace=True)

In [7]:
# Preprocess text (lowercasing, removing punctuation, and stopwords)
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    stop_words = set(stopwords.words('english'))  # Using English stopwords
    return ' '.join([word for word in text.split() if word not in stop_words])

df['cleaned_text'] = df['text'].apply(preprocess_text)


In [8]:
# Remove rows with empty 'cleaned_text' after preprocessing
df = df[df['cleaned_text'].str.strip() != '']

In [9]:
# Vectorize text using TF-IDF
tfidf = TfidfVectorizer(max_features=1000)
X = tfidf.fit_transform(df['cleaned_text'])
y = df['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Logistic Regression model
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train, y_train)

# Make predictions
y_pred = logreg_model.predict(X_test)

# Evaluate the model's performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save the Logistic Regression model and the TF-IDF vectorizer for future use
with open('logreg_model.pkl', 'wb') as f:
    pickle.dump(logreg_model, f)

with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)




Accuracy: 0.9027676087274857
              precision    recall  f1-score   support

           0       0.91      0.90      0.90      3497
           1       0.90      0.91      0.90      3332

    accuracy                           0.90      6829
   macro avg       0.90      0.90      0.90      6829
weighted avg       0.90      0.90      0.90      6829



In [12]:
# Display a few actual vs predicted examples
for i in range(5):
    print(f"Text: {df['text'].iloc[i]}")
    print(f"Actual Label: {y_test.iloc[i]} | Predicted Label: {y_pred[i]}")
    print("-" * 50)

Text: donald trump sends out embarrassing new year‚s eve message; this is disturbing
Actual Label: 0 | Predicted Label: 0
--------------------------------------------------
Text: drunk bragging trump staffer started russian collusion investigation
Actual Label: 0 | Predicted Label: 0
--------------------------------------------------
Text: sheriff david clarke becomes an internet joke for threatening to poke people ‚in the eye‚
Actual Label: 1 | Predicted Label: 1
--------------------------------------------------
Text: trump is so obsessed he even has obama‚s name coded into his website (images)
Actual Label: 1 | Predicted Label: 1
--------------------------------------------------
Text: pope francis just called out donald trump during his christmas speech
Actual Label: 1 | Predicted Label: 1
--------------------------------------------------


In [13]:
# Import Logistic Regression
from sklearn.linear_model import LogisticRegression

# Initialize and train Logistic Regression model
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train, y_train)

# Get the importance of words from the model's coefficients
top_n = 10  # Number of top words to display
sorted_items = logreg_model.coef_[0].argsort()  # Sort the coefficients by importance

# Get the feature names (words)
feature_names = tfidf.get_feature_names_out()

# Display top words for class 0 (e.g., machine-translated sentences)
print(f"Top {top_n} words for Machine-translated sentences:")
for i in sorted_items[:top_n]:
    print(f"{feature_names[i]}")

# Display top words for class 1 (e.g., human-translated sentences)
print(f"\nTop {top_n} words for Human-translated sentences:")
for i in sorted_items[-top_n:]:
    print(f"{feature_names[i]}")



Top 10 words for Machine-translated sentences:
video
breaking
gop
hillary
hillarys
racist
bernie
wow
muslim
tweets

Top 10 words for Human-translated sentences:
pm
china
urges
house
probe
seeks
talks
us
factbox
says
