In [130]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/alchemy/unlabelled.csv
/kaggle/input/alchemy/11_word_set.csv
/kaggle/input/alchemy/new_11_word_set.csv
/kaggle/input/alchemy/trainingset.csv
/kaggle/input/alchemy/compressed_documents_2.csv.gzip
/kaggle/input/alchemy/all_fragments.csv
/kaggle/input/alchemy/compressed_documents.csv.gzip


In [131]:
# Reads documents
df = pd.read_csv('/kaggle/input/alchemy/new_11_word_set.csv')
unlabelled_df = pd.read_csv('/kaggle/input/alchemy/unlabelled.csv')
complete_df = pd.read_csv('/kaggle/input/alchemy/all_fragments.csv')

In [132]:
from sklearn.model_selection import train_test_split

# Uses the sentence "Fragments" as independent variable
sentences = df['Fragments'].values

# Set dependent variable here, "Principles" or "Lab/Ops"
Y = df['Principles'].values

# Splits datasets into 20% test data, 80% train data
sentences_train, sentences_test, Y_train, Y_test = train_test_split(sentences, Y, test_size = .2, random_state = 39)

In [133]:
# Imports vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Creates a TF-IDF vectorizer and fits it onto the training sentences
vectorizer = TfidfVectorizer()
vectorizer.fit(sentences_train)

# Creates new independent variables that are transformed with vectorizer
X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)

In [134]:
# Imports classifier model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Fits classifier onto training data
classifier = LogisticRegression()
classifier.fit(X_train, Y_train)

# Creates score based on test data
score = classifier.score(X_test, Y_test)
print("Accuracy:", score)

Accuracy: 0.8977272727272727


In [135]:
# Creates confusion matrix and classification report using predictions for test data
from sklearn.metrics import confusion_matrix, classification_report
Y_pred = classifier.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

[[74  1]
 [ 8  5]]
              precision    recall  f1-score   support

           0       0.90      0.99      0.94        75
           1       0.83      0.38      0.53        13

    accuracy                           0.90        88
   macro avg       0.87      0.69      0.73        88
weighted avg       0.89      0.90      0.88        88



In [136]:
# Cross validation score
from sklearn.model_selection import cross_val_score
from statistics import mean
print(mean(cross_val_score(classifier, X_train, Y_train, cv=5)))

0.7873291925465838


In [137]:
# Creates predictions for all the fragments in the dataset
all_fragments = complete_df['Fragments']
predictions = []
fragments_count = []

# Loops through all the fragments and predicts if they belong to the category or not. 
# The category (Principles or Lab/Ops) depends on the variable used at the beginning, the value of variable "Y".
for i in all_fragments:
    fragments_list = i.strip('][').split(', ')
    
    unlabelled_X = vectorizer.transform(fragments_list)
    unlabelled_predictions = classifier.predict(unlabelled_X)
    predictions.append(np.count_nonzero(unlabelled_predictions == 1))
    fragments_count.append(len(fragments_list))

# Saves these predictions to a csv.
complete_df['Predictions'] = fragments_count
complete_df['Principles Predictions'] = predictions
complete_df = complete_df.drop(['Unnamed: 0', 'Fragments'], axis=1)
complete_df
complete_df.to_csv('principles_predictions.csv')