In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [11]:
def clean_text(text):
    # Convert text to lowercase, remove punctuation and strip extra spaces
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [4]:
labelled_data = pd.read_csv('/home/checkworthiness_labeled.csv')

In [5]:
labelled_data.head()

Unnamed: 0,ID,Text,Category
0,16,I think we've seen a deterioration of values.,No
1,17,I think for a while as a nation we condoned th...,No
2,20,"We got away, we got into this feeling that val...",No
3,21,And I don't believe that at all I do believe t...,No
4,22,"And, of course, as far as the how we make it b...",No


In [7]:
labelled_data['Category'].value_counts()


Category
No     7041
Yes    1952
Name: count, dtype: int64

In [8]:
leaderboard_data = pd.read_csv('/home/checkworthiness_leaderboard.csv')

In [9]:
leaderboard_data.head()

Unnamed: 0,ID,Text
0,26,"You know, I saw a movie - ""Crocodile Dundee."""
1,80,We're consuming 50 percent of the world's coca...
2,129,That answer was about as clear as Boston harbor.
3,131,Let me help the governor.
4,172,We've run up more debt in the last eight years...


In [12]:
labelled_data['Text'] = labelled_data['Text'].apply(clean_text)
leaderboard_data['Text'] = leaderboard_data['Text'].apply(clean_text)


In [13]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [14]:
tfidf_vectorizer.fit(labelled_data['Text'])
labelled_features = tfidf_vectorizer.transform(labelled_data['Text'])
leaderboard_features = tfidf_vectorizer.transform(leaderboard_data['Text'])

In [16]:
labelled_features.shape, leaderboard_features.shape

((8993, 5000), (1414, 5000))

In [18]:
category_mapping = {'Yes': 1, 'No': 0}
labelled_data['Category_Binary'] = labelled_data['Category'].map(category_mapping)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(labelled_features, labelled_data['Category_Binary'], test_size=0.2, random_state=42)


In [21]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [23]:
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Validation Accuracy: {accuracy}")

Validation Accuracy: 0.8615897720956087


In [24]:
leaderboard_predictions = model.predict(leaderboard_data)

In [25]:
submission_df = pd.DataFrame({
    'ID': leaderboard_data['ID'],
    'Category': ['Yes' if pred == 1 else 'No' for pred in leaderboard_predictions]
})

In [27]:
submission_df.to_csv('/home/checkworthiness_predictions.csv', index=False)

In [29]:
print(leaderboard_features)

  (0, 4985)	0.3324524854078541
  (0, 3882)	0.8127215937526926
  (0, 2565)	0.47849655797531476
  (1, 4960)	0.27434502529933086
  (1, 4858)	0.1351754727303764
  (1, 4518)	0.0992078899404432
  (1, 3584)	0.2195792202679582
  (1, 3240)	0.30557703796545405
  (1, 3060)	0.13030328662218876
  (1, 1073)	0.5437653272568509
  (1, 941)	0.5437653272568509
  (1, 93)	0.3829979430439856
  (2, 4843)	0.2445293709368198
  (2, 4517)	0.13388964312072094
  (2, 2114)	0.570332108539658
  (2, 922)	0.3953895090034074
  (2, 401)	0.4899610892197814
  (2, 330)	0.3751744359458678
  (2, 130)	0.24464187298225681
  (3, 4518)	0.16833971415457977
  (3, 2810)	0.42881831834381784
  (3, 2648)	0.4579263010575741
  (3, 2165)	0.5447364469027134
  (3, 2033)	0.5304170969215356
  (4, 4979)	0.16967001662747866
  :	:
  (1410, 1937)	0.3607585657869906
  (1410, 1066)	0.4225941212375413
  (1410, 320)	0.11488445265090536
  (1411, 4921)	0.5086920118496974
  (1411, 4917)	0.2837995878401108
  (1411, 4887)	0.27155155466337655
  (1411, 4754