# Imports

In [16]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline

# Parameters

In [2]:
train_data_path = 'train_data_challenge.csv'
test_data_path = 'test_data_challenge.csv'

feature_col = 'ArticleTitle'

# Load Train & Test data

In [3]:
train_data = pd.read_csv(train_data_path, index_col=0)
train_data

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer
0,Alessandro_Volta,Was Alessandro Volta a professor of chemistry?,Alessandro Volta was not a professor of chemis...,easy,easy
1,Alessandro_Volta,Was Alessandro Volta a professor of chemistry?,No,easy,hard
2,Alessandro_Volta,Did Alessandro Volta invent the remotely opera...,Alessandro Volta did invent the remotely opera...,easy,easy
3,Alessandro_Volta,Did Alessandro Volta invent the remotely opera...,Yes,easy,easy
4,Alessandro_Volta,Was Alessandro Volta taught in public schools?,Volta was taught in public schools.,easy,easy
...,...,...,...,...,...
3992,Woodrow_Wilson,What lived in Columbia?,Wilson,,too hard
3993,Woodrow_Wilson,Was Wilson president of the American Political...,Yes,,easy
3995,Woodrow_Wilson,Did Wilson not spend 1914 through the beginnin...,Yes,,easy
3996,Woodrow_Wilson,"Was Wilson , a staunch opponent of antisemitis...",Yes,,easy


In [4]:
test_data = pd.read_csv(test_data_path, index_col=0)
test_data

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer
6,Alessandro_Volta,Who did Alessandro Volta marry?,Alessandro Volta married Teresa Peregrini.,medium,medium
14,Alessandro_Volta,When did Alessandro Volta improve and popular...,Alessandro Volta improved and popularized the ...,hard,hard
26,Alessandro_Volta,Where was Volta born?,Como,medium,medium
30,Alessandro_Volta,A year before improving and popularizing the e...,A professor of physics at the Royal School in ...,hard,hard
46,Amedeo_Avogadro,Was he a member of the Royal Superior Council ...,Yes,easy,easy
...,...,...,...,...,...
3938,Uruguay,Is Uruguay the smallest soverign nation in Sou...,No,hard,hard
3957,Woodrow_Wilson,Did the U.S. join the League of Nations?,No,easy,medium
3962,Woodrow_Wilson,Who was President when Wilson finished Congres...,Grover Cleveland,medium,medium
3969,Woodrow_Wilson,Was Wilson a member of the Phi Kappa Psi frate...,yes,easy,easy


## Summaries

In [5]:
train_data.describe()

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer
count,3642,3607,3121,2768,3119
unique,109,2357,1682,4,5
top,Amedeo_Avogadro,Was King Victor Emmanuel III there to pay homa...,Yes,medium,easy
freq,119,5,448,947,1240


In [6]:
train_data[['ArticleTitle', 'Question', 'Answer']].dropna().describe()

Unnamed: 0,ArticleTitle,Question,Answer
count,3121,3121,3121
unique,107,2109,1682
top,Amedeo_Avogadro,What do river otters eat?,Yes
freq,78,4,448


In [7]:
test_data.describe()

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer
count,356,354,301,275,299
unique,107,342,179,3,5
top,Amedeo_Avogadro,Was he a member of the Royal Superior Council ...,Yes,hard,medium
freq,13,2,44,96,114


In [8]:
test_data[['ArticleTitle', 'Question', 'Answer']].dropna().describe()

Unnamed: 0,ArticleTitle,Question,Answer
count,301,301,301
unique,101,290,179
top,Alessandro_Volta,Does Portuguese contain words from the Arabic ...,Yes
freq,9,2,44


## Split features

In [5]:
X_train = train_data[[x for x in train_data.columns if x != feature_col]]
X_train.head()

Unnamed: 0,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer
0,Was Alessandro Volta a professor of chemistry?,Alessandro Volta was not a professor of chemis...,easy,easy
1,Was Alessandro Volta a professor of chemistry?,No,easy,hard
2,Did Alessandro Volta invent the remotely opera...,Alessandro Volta did invent the remotely opera...,easy,easy
3,Did Alessandro Volta invent the remotely opera...,Yes,easy,easy
4,Was Alessandro Volta taught in public schools?,Volta was taught in public schools.,easy,easy


In [6]:
y_train = train_data[feature_col]
y_train.head()

0    Alessandro_Volta
1    Alessandro_Volta
2    Alessandro_Volta
3    Alessandro_Volta
4    Alessandro_Volta
Name: ArticleTitle, dtype: object

In [7]:
X_test = test_data[[x for x in test_data.columns if x != feature_col]]
X_test.head()

Unnamed: 0,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer
6,Who did Alessandro Volta marry?,Alessandro Volta married Teresa Peregrini.,medium,medium
14,When did Alessandro Volta improve and popular...,Alessandro Volta improved and popularized the ...,hard,hard
26,Where was Volta born?,Como,medium,medium
30,A year before improving and popularizing the e...,A professor of physics at the Royal School in ...,hard,hard
46,Was he a member of the Royal Superior Council ...,Yes,easy,easy


In [8]:
y_test = test_data[feature_col]
y_test.head()

6     Alessandro_Volta
14    Alessandro_Volta
26    Alessandro_Volta
30    Alessandro_Volta
46     Amedeo_Avogadro
Name: ArticleTitle, dtype: object

In [5]:
def split_feature_target(train_data, test_data):
    
    X_train = train_data[[x for x in train_data.columns if x != feature_col]]
    y_train = train_data[feature_col]
    X_test = test_data[[x for x in test_data.columns if x != feature_col]]
    y_test = test_data[feature_col]
    
    return X_train, y_train, X_test, y_test

# Feature Selection

In [6]:
def featSel_Q(train_data, test_data):
    
    # Remove nulls
    train_data = train_data[['Question', feature_col]].dropna()
    test_data = test_data[['Question', feature_col]].dropna()
    
    # Split between features and target
    X_train, y_train, X_test, y_test = split_feature_target(train_data, test_data)
    
    # Use only question
    X_train = X_train['Question']
    X_test = X_test['Question']
    
    return X_train, y_train, X_test, y_test

In [7]:
def featSel_QA(train_data, test_data):
    
    # Remove nulls
    train_data = train_data[['Question','Answer', feature_col]].dropna()
    test_data = test_data[['Question','Answer', feature_col]].dropna()
    
    # Split between features and target
    X_train, y_train, X_test, y_test = split_feature_target(train_data, test_data)
    
    # Combine Question and Answer into 1 sentence
    X_train = X_train['Question'] + " " + X_train['Answer']
    X_test = X_test['Question'] + " " + X_test['Answer']
    
    return X_train, y_train, X_test, y_test

In [8]:
def featSel_QA_splitArticlTitle(train_data, test_data):
    
    # Remove nulls
    train_data = train_data[['Question','Answer', feature_col]].dropna()
    test_data = test_data[['Question','Answer', feature_col]].dropna()
    
    # Split between features and target
    X_train, y_train, X_test, y_test = split_feature_target(train_data, test_data)
    
    # Combine Question and Answer into 1 sentence
    X_train = X_train['Question'] + " " + X_train['Answer']
    X_test = X_test['Question'] + " " + X_test['Answer']
    
    # replace '_' with ' ', so that tokenization works better
    y_train = y_train.str.replace("_", " ")
    y_test = y_test.str.replace("_", " ")
    
    return X_train, y_train, X_test, y_test

# Pre-Processing

In [9]:
def preprocess_1(X_train, y_train, X_test, y_test, lowercase=True, ngram_range=(1,1)):
    
    # scikit-learn module to tokenize, counts term frequencies and normalize text data
    vectorizer = TfidfVectorizer(lowercase=lowercase, ngram_range=ngram_range)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    
    return X_train_vec, X_test_vec

In [10]:
def preprocess_2(X_train, y_train, X_test, y_test, lowercase=True, ngram_range=(1,1)):
    
    # scikit-learn module to tokenize, counts term frequencies and normalize text data
    vectorizer = CountVectorizer(lowercase=lowercase, ngram_range=ngram_range)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    
    return X_train_vec, X_test_vec

# Model

In [11]:
def model_LogReg(X_train_vec, y_train, X_test_vec, y_test, accuracy=True, C=1.0, class_weight=None):
    
    #Scikit learn logistic regression classifier
    model = LogisticRegression(C=C, class_weight=class_weight)
    model.fit(X_train_vec, y_train)
    
    y_pred = model.predict(X_test_vec)
    if accuracy:
        print("Accuracy:", accuracy_score(y_test, y_pred))
    
    return model, y_pred

In [12]:
def model_NaiveBayes(X_train_vec, y_train, X_test_vec, y_test, accuracy=True):
    
    #Scikit learn logistic naive bayes classifier
    model = MultinomialNB()
    model.fit(X_train_vec, y_train)
    
    y_pred = model.predict(X_test_vec)
    if accuracy:
        print("Accuracy:", accuracy_score(y_test, y_pred))
    
    return model, y_pred

# Tests

In [21]:
# Using only questions
X_train, y_train, X_test, y_test = featSel_Q(train_data, test_data)
X_train_vec, X_test_vec = preprocess_1(X_train, y_train, X_test, y_test)
model,_ = model_LogReg(X_train_vec, y_train, X_test_vec, y_test)

Accuracy: 0.8135593220338984


In [22]:
# Using questions and answers
X_train, y_train, X_test, y_test = featSel_QA(train_data, test_data)
X_train_vec, X_test_vec = preprocess_1(X_train, y_train, X_test, y_test)
model,_ = model_LogReg(X_train_vec, y_train, X_test_vec, y_test)

Accuracy: 0.8604651162790697


In [23]:
# Using questions and answers and Uppercase
X_train, y_train, X_test, y_test = featSel_QA(train_data, test_data)
X_train_vec, X_test_vec = preprocess_1(X_train, y_train, X_test, y_test, lowercase=False)
model,_ = model_LogReg(X_train_vec, y_train, X_test_vec, y_test)

Accuracy: 0.8471760797342193


Setting lower case to false means that the words are not converted to lowercase before tokenization. The reasoning behind was that maybe it would better get article titles with person names or acronyms, but seems that it only confuses more the classifier.

In [24]:
# Using questions and answers and considering also bigrams
X_train, y_train, X_test, y_test = featSel_QA(train_data, test_data)
X_train_vec, X_test_vec = preprocess_1(X_train, y_train, X_test, y_test, ngram_range=(1,2))
model,_ = model_LogReg(X_train_vec, y_train, X_test_vec, y_test)

Accuracy: 0.8438538205980066


The idea was that many of the article titles contain 2 words, so would be easy for the algorithm to predict the title if it also took into consideration context (not only isolated words). Nevertheless, as in the lowercase tentative, the performance does not increase possibly because the training set is not very big and both approaches make the feature vectors bigger.

In [25]:
# Using questions and answers now with Naive Bayes
X_train, y_train, X_test, y_test = featSel_QA(train_data, test_data)
X_train_vec, X_test_vec = preprocess_1(X_train, y_train, X_test, y_test)
model,_ = model_NaiveBayes(X_train_vec, y_train, X_test_vec, y_test)

Accuracy: 0.6710963455149501


In [26]:
# Using questions and answers now with Naive Bayes and considering also bigrams
X_train, y_train, X_test, y_test = featSel_QA(train_data, test_data)
X_train_vec, X_test_vec = preprocess_1(X_train, y_train, X_test, y_test, ngram_range=(1,2))
model,_ = model_NaiveBayes(X_train_vec, y_train, X_test_vec, y_test)

Accuracy: 0.6777408637873754


In [27]:
# Using questions and answers now with Naive Bayes and considering also bigrams (with counts instead of tf-idf)
X_train, y_train, X_test, y_test = featSel_QA(train_data, test_data)
X_train_vec, X_test_vec = preprocess_2(X_train, y_train, X_test, y_test, ngram_range=(1,1))
model,_ = model_NaiveBayes(X_train_vec, y_train, X_test_vec, y_test)

Accuracy: 0.7242524916943521


Naive Bayes can perform better with word counts instead of tf-idf in some cases, it's possible that this dataset since it's questions and answers (small texts) do not require the inverse document frequency normalization. Let's then select the best ngram_range with a validation set

In [28]:
# Using questions and answers now with Naive Bayes choosing ngram with validation set (with counts instead of tf-idf)
X_train, y_train, X_test, y_test = featSel_QA(train_data, test_data)

# Split the training set into train and valitdation
X_train_new, X_val, y_train_new, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

best_ngram_range = None
best_accuracy = 0.0

for n in range(1, 7):  # Let's try with 1 to 6
    # Vectorize train and val set
    X_train_vec, X_val_vec = preprocess_2(X_train_new, y_train_new, X_val, y_val, ngram_range=(1,n))
    
    model, y_pred = model_NaiveBayes(X_train_vec, y_train_new, X_val_vec, y_val, accuracy=False)
    
    y_val_pred = model.predict(X_val_vec)
    accuracy = accuracy_score(y_val, y_val_pred)
    
    print(f"Validation Accuracy for ngram_range=(1,{n}):", accuracy)
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_ngram_range = (1, n)
        
print("Best ngram_range:", best_ngram_range)
# Train now on the entire training set with the best parameters and test on the test set
X_train_vec, X_test_vec = preprocess_2(X_train, y_train, X_test, y_test, ngram_range=best_ngram_range)
model,_ = model_NaiveBayes(X_train_vec, y_train, X_test_vec, y_test)

Validation Accuracy for ngram_range=(1,1): 0.6768
Validation Accuracy for ngram_range=(1,2): 0.7312
Validation Accuracy for ngram_range=(1,3): 0.7424
Validation Accuracy for ngram_range=(1,4): 0.7456
Validation Accuracy for ngram_range=(1,5): 0.7408
Validation Accuracy for ngram_range=(1,6): 0.736
Best ngram_range: (1, 4)
Accuracy: 0.8106312292358804


We now got an accuracy >80% with naive bayes. Also to note that the accuracy on the validation set is much lower than on the test set which might indicate that with more samples we could improve even more the model.
Naive Bayes is still performing worse than our previous result with logistic regression, but since counting tokens instead of the if-idf approach improves so much the bayes accuracy, let's try the same for the logistic regression

In [29]:
# Using questions and answers with Logistic Regression choosing ngram with validation set (with counts instead of tf-idf)
X_train, y_train, X_test, y_test = featSel_QA(train_data, test_data)

# Split the training set into train and valitdation
X_train_new, X_val, y_train_new, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

best_ngram_range = None
best_accuracy = 0.0

for n in range(1, 5):  # Let's try with 1 to 4
    # Vectorize train and val set
    X_train_vec, X_val_vec = preprocess_2(X_train_new, y_train_new, X_val, y_val, ngram_range=(1,n))
    
    model, y_pred = model_LogReg(X_train_vec, y_train_new, X_val_vec, y_val, accuracy=False)
    
    y_val_pred = model.predict(X_val_vec)
    accuracy = accuracy_score(y_val, y_val_pred)
    
    print(f"Validation Accuracy for ngram_range=(1,{n}):", accuracy)
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_ngram_range = (1, n)
        
print("Best ngram_range:", best_ngram_range)
# Train now on the entire training set with the best parameters and test on the test set
X_train_vec, X_test_vec = preprocess_2(X_train, y_train, X_test, y_test, ngram_range=best_ngram_range)
model,_ = model_LogReg(X_train_vec, y_train, X_test_vec, y_test)

Validation Accuracy for ngram_range=(1,1): 0.8528
Validation Accuracy for ngram_range=(1,2): 0.8528
Validation Accuracy for ngram_range=(1,3): 0.8416
Validation Accuracy for ngram_range=(1,4): 0.8368
Best ngram_range: (1, 1)
Accuracy: 0.8870431893687708


## Best model tests

The idea here is to try to understand the misses

In [14]:
# Using questions and answers
X_train, y_train, X_test, y_test = featSel_QA(train_data, test_data)
X_train_vec, X_test_vec = preprocess_2(X_train, y_train, X_test, y_test)
best_model, y_pred = model_LogReg(X_train_vec, y_train, X_test_vec, y_test)

Accuracy: 0.8870431893687708


In [31]:
# boolean array that is True when prediction doesnt match ground truth
difference = y_test != y_pred

# Create df with misses (to try to understand them)
test_data_misses = test_data[test_data.index.isin(difference[difference].index)]
test_data_misses['Prediction'] = y_pred[difference]
test_data_misses.to_csv('test_data_misses.csv')
test_data_misses.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_misses['Prediction'] = y_pred[difference]


Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,Prediction
46,Amedeo_Avogadro,Was he a member of the Royal Superior Council ...,Yes,easy,easy,Michael_Faraday
47,Amedeo_Avogadro,Was he a member of the Royal Superior Council ...,"Yes, Avogadro was a member of the Royal Super...",easy,easy,Michael_Faraday
117,Ant,Are termites actually more closely related to ...,"Yes, termites are actually more closely relate...",,easy,Dragonfly
192,Berlin,Does the Gendarmenmarkt border the French Cath...,"Yes, the Gendarmenmarkt borders the French Cat...",easy,medium,French_language
627,Jackson_Pollock,Give an example of the origins of the term act...,Pollock's technique of pouring and dripping pa...,,medium,Cymbal


Differenciating between question and answer probably would have some impact, since the algorithm right now can make no sense of the answer "yes" or "no" for example. Also, the accuracy is already considerable with the amount of samples we have, would also be good to be able to use n_grams or other forms of giving context to words but that would require a bigger dataset since the feature vector becomes larger.

## Cross Validation Tests

In [15]:
# using the best model so far
X_train, y_train, X_test, y_test = featSel_QA(train_data, test_data)
X_train_vec, X_test_vec = preprocess_2(X_train, y_train, X_test, y_test)

# predict train & test labels
y_train_pred = best_model.predict(X_train_vec)
y_test_pred = best_model.predict(X_test_vec)

# compute accuracies
print("Train Accuracy:", accuracy_score(y_train, y_train_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))

Train Accuracy: 0.9990387696251202
Test Accuracy: 0.8870431893687708


This means that we might be overfitting the data, let's then play with the regularization parameter

In [43]:
# Using questions and answers with Logistic Regression choosing ngram with validation set (with counts instead of tf-idf)
X_train, y_train, X_test, y_test = featSel_QA(train_data, test_data)
    
# pipeline tokenization & model to apply cross-validation
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression())
])

# cross-validation with grid of parameters
param_grid =  {'vectorizer__ngram_range':[(1,1),(1,2)],
               'classifier__C':[1, 0.1, 0.01],
               'classifier__class_weight':[None, 'balanced']}

model = GridSearchCV(pipeline, param_grid, cv=4, verbose=4)
model.fit(X_train, y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits




[CV 1/4] END classifier__C=1, classifier__class_weight=None, vectorizer__ngram_range=(1, 1); total time=   4.5s
[CV 2/4] END classifier__C=1, classifier__class_weight=None, vectorizer__ngram_range=(1, 1); total time=   3.9s
[CV 3/4] END classifier__C=1, classifier__class_weight=None, vectorizer__ngram_range=(1, 1); total time=   4.0s
[CV 4/4] END classifier__C=1, classifier__class_weight=None, vectorizer__ngram_range=(1, 1); total time=   3.7s
[CV 1/4] END classifier__C=1, classifier__class_weight=None, vectorizer__ngram_range=(1, 2); total time=  17.0s
[CV 2/4] END classifier__C=1, classifier__class_weight=None, vectorizer__ngram_range=(1, 2); total time=  15.3s
[CV 3/4] END classifier__C=1, classifier__class_weight=None, vectorizer__ngram_range=(1, 2); total time=  15.6s
[CV 4/4] END classifier__C=1, classifier__class_weight=None, vectorizer__ngram_range=(1, 2); total time=  13.3s
[CV 1/4] END classifier__C=1, classifier__class_weight=balanced, vectorizer__ngram_range=(1, 1); total t

GridSearchCV(cv=4,
             estimator=Pipeline(steps=[('vectorizer', CountVectorizer()),
                                       ('classifier', LogisticRegression())]),
             param_grid={'classifier__C': [1, 0.1, 0.01],
                         'classifier__class_weight': [None, 'balanced'],
                         'vectorizer__ngram_range': [(1, 1), (1, 2)]},
             verbose=4)

In [44]:
model.best_params_

{'classifier__C': 1,
 'classifier__class_weight': 'balanced',
 'vectorizer__ngram_range': (1, 1)}

In [45]:
model.best_score_

0.752957254013592

Here although we seem to be overfitting the training data, increasing the regularization parameter does not yeld better results. Still, balancing the weights of each class (giving stronger weights to less seen classes) increase the performance, which makes sense since we have some classes that have very few samples but we still want to be able to predict them (works a bit as a gradient boosting). Also, since we have a lot of classes and some have very few examples, cross-validation is not as reliable as if we had fewer classes with more examples

In [46]:
# Using questions and answers
X_train, y_train, X_test, y_test = featSel_QA(train_data, test_data)
X_train_vec, X_test_vec = preprocess_2(X_train, y_train, X_test, y_test)
model, y_pred = model_LogReg(X_train_vec, y_train, X_test_vec, y_test, C=1, class_weight='balanced', accuracy=False)

# predict train & test labels
y_train_pred = model.predict(X_train_vec)

# compute accuracies
print("Train Accuracy:", accuracy_score(y_train, y_train_pred))
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Train Accuracy: 0.9971163088753605
Test Accuracy: 0.9036544850498339


# Changing Article Titles

In [18]:
test_data['ArticleTitle'].drop_duplicates().to_list()

['Alessandro_Volta',
 'Amedeo_Avogadro',
 'Ant',
 'Antwerp',
 'Arabic_language',
 'Berlin',
 'Blaise_Pascal',
 'Butterfly',
 'Charles-Augustin_de_Coulomb',
 'Chinese_language',
 'Cougar',
 'Cymbal',
 'Dragonfly',
 'Drum',
 'Eel',
 'Finnish_language',
 'Flute',
 'Giant_Panda',
 'Guitar',
 'Henri_Becquerel',
 'Isaac_Newton',
 'Jackson_Pollock',
 'Jakarta',
 'James_Watt',
 'Korean_language',
 'Kuala_Lumpur',
 'Leonardo_da_Vinci',
 'Lobster',
 'Lyre',
 'Malay_language',
 'Melbourne',
 'Michael_Faraday',
 'Michelangelo',
 'Montreal',
 'Nairobi',
 'Nikola_Tesla',
 'Norman_Rockwell',
 'Octopus',
 'Pablo_Picasso',
 'Piano',
 'Pierre-Auguste_Renoir',
 'Portuguese_language',
 'Saint_Petersburg',
 'San_Francisco',
 'Swahili_language',
 'Swedish_language',
 'Taipei',
 'Trumpet',
 'Turkish_language',
 'Vietnamese_language',
 'Vincent_van_Gogh',
 'Violin',
 'Xylophone',
 'Zebra',
 'Anders_Celsius',
 'Bee',
 'Beijing',
 'Cello',
 'Copenhagen',
 'Dhaka',
 'English_language',
 'Fox',
 'French_language'

It seems that article titles are separated by "_", thus it's probably easier for the algorithm if we split it into actual words, this way the target vector will be smaller and more related to the feature vector

In [49]:
# Using the best model so far
X_train, y_train, X_test, y_test = featSel_QA_splitArticlTitle(train_data, test_data)
X_train_vec, X_test_vec = preprocess_2(X_train, y_train, X_test, y_test)
model, y_pred = model_LogReg(X_train_vec, y_train, X_test_vec, y_test, class_weight='balanced', accuracy=False)

# predict train & test labels
y_train_pred = model.predict(X_train_vec)

# compute accuracies
print("Train Accuracy:", accuracy_score(y_train, y_train_pred))
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Train Accuracy: 0.9971163088753605
Test Accuracy: 0.9036544850498339


It doesn't seem to improve our results probably because we were already achieving a very good performance and we are not taking word context into consideration, but still let's continue using this approach of removing the underscore since it might be useful if we try other approaches where context (RNNs) and word meaning (word embbedings) are taken into account