In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
# Switch the working directory to the project folder on the mounted Drive so the
# relative "data/..." paths used by later cells resolve against it.
path = '/content/drive/My Drive/STAT457'
os.chdir(path)
# List the folder contents as the cell output.
os.listdir()

['data.ipynb',
 'data',
 'trained.ipynb',
 'csvData.csv',
 'tfidf_xgb.csv',
 'Untitled0.ipynb']

In [None]:
!pip install mpld3



In [None]:
import numpy as np
import pandas as pd
import nltk
# Download the NLTK resources used below: the stop-word corpus and the 'punkt'
# tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem.snowball import SnowballStemmer
import re
import os
import codecs
from sklearn import feature_extraction, model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
import mpld3

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the combined ("full"), training and test tables from the data/ folder.
# NOTE(review): the saved output of this cell shows garbled accented characters,
# so 'gb18030' may not be the files' real encoding - confirm (e.g. against utf-8).
full = pd.read_csv("data/W22_P2_full.csv",encoding='gb18030')
train = pd.read_csv("data/W22_P2_train.csv",encoding='gb18030')
test = pd.read_csv("data/W22_P2_test.csv",encoding='gb18030')
# Preview the first five rows of the combined table.
print(full.head(5))

     id                                        description
0  9014   Docu-drama inspired by the life of Bartolomeo...
1  2923   Jos鑼?is a passionate young man in the Dominic...
2   313   Imposing, austere gray buildings dominate a s...
3  8292   In the multiplex era, a few days prior to his...
4  4131   Edmund Purdom narrates a pseudo-documentary a...


In [None]:
# Pull the label, text and id columns out of the loaded tables, each one as a
# single-column DataFrame.
train_y = train.loc[:, ["genre"]]
train_x = train.loc[:, ["description"]]
description = full.loc[:, ["description"]]
test_x = test.loc[:, ["description"]]
test_id = test.loc[:, ["id"]]

In [None]:
# Flatten the label / text / id columns into plain Python lists.
# Every list is read directly from the loaded DataFrames rather than from the
# variable it overwrites, so the cell is idempotent: running it a second time
# yields the same lists instead of re-indexing already-flattened values.
train_y = train["genre"].tolist()
train_x = train["description"].tolist()
description = full["description"].tolist()
test_x = test["description"].tolist()
# One plain int per test row, in file order.
test_id = [int(v) for v in test["id"].tolist()]

In [None]:
# Print the first test description as a quick look at the raw text.
print(test_x[0])

 According to the text of St鑼卲hane E.Roy Nine slices of life. Nine stories that intertwine. A satirical comedy. Marc Gauthier, creator of the new "Dare Communic-Action 鑹? alleged communication guru offers a new approach. But there will always be a gap between theory and practice ... Between nine earthy situations and absurd misunderstandings everyone will try to grow up"


In [None]:
# Load NLTK's English stop-word list and print its first ten entries.
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords[:10])
# Create NLTK's English SnowballStemmer and bind it to the name 'stemmer'.
stemmer = SnowballStemmer("english")

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [None]:
# Two tokenizer helpers are defined for the descriptions:
#   tokenize_and_stem - splits a text into word tokens and stems each token
#   tokenize_only     - splits a text into lower-cased word tokens, no stemming
def tokenize_and_stem(text):
    """Return the stemmed word tokens of ``text`` as a list.

    The text is split into sentences and then into words, so punctuation comes
    out as separate tokens. Tokens without any ASCII letter (numbers, bare
    punctuation) are dropped; every remaining token is passed through the
    module-level ``stemmer``.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # keep only tokens that contain at least one letter
            if re.search('[a-zA-Z]', word) is not None:
                stems.append(stemmer.stem(word))
    return stems


def tokenize_only(text):
    """Return the lower-cased word tokens of ``text`` as a list, without stemming.

    The text is split into sentences and then into words, so punctuation comes
    out as separate tokens. Tokens without any ASCII letter (numbers, bare
    punctuation) are dropped.
    """
    lowered = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [word for word in lowered if re.search('[a-zA-Z]', word)]

In [None]:
# Display the first entry of the combined description list.
description[0]

' Docu-drama inspired by the life of Bartolomeo Scappi, the personal chef of the 16th century Pontiff, Pope Pius V. Film looks at the cooking implements, ingredients and recipes used by Scappi, who has been called 鎵?the Michelangelo of the kitchen. 閽?\n2909, short , Unseen in the background is fate and it\'s about to start a journey with Officer Trevor Lewis who is dealing with the tragic loss of his son. The grief he feels haunts his soul and yet somehow a small musical snow globe is all that is needed to begin the healing. A journey good deeds and a prayer all come together in a story of hope."'

In [None]:
# Accumulate two flat token lists over every description: the stemmed tokens and
# the lower-cased, unstemmed tokens.
totalvocab_stemmed = []
totalvocab_tokenized = []

for synopsis in description:
    totalvocab_stemmed += tokenize_and_stem(synopsis)
    totalvocab_tokenized += tokenize_only(synopsis)

In [None]:
# Same two flat token lists as for the full corpus, built from the test
# descriptions only.
test_stemmed = [stem for text in test_x for stem in tokenize_and_stem(text)]
test_tokenized = [token for text in test_x for token in tokenize_only(text)]

In [None]:
# Lookup table with one row per token occurrence: the index holds the stemmed
# form and the 'words' column holds the lower-cased token.
vocab_frame = pd.DataFrame(data={'words': totalvocab_tokenized}, index=totalvocab_stemmed)
n_vocab_rows = len(vocab_frame)
print('there are ' + str(n_vocab_rows) + ' items in vocab_frame')

In [None]:
# Display the stem -> token lookup table.
vocab_frame

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over 1- to 3-grams of stemmed tokens. Terms that occur in more than 80%
# or fewer than 1% of the documents are dropped.
# NOTE(review): the saved output of this cell contains a fragment of a warning
# ("% sorted(inconsistent)"). This looks like scikit-learn's stop-word consistency
# warning, presumably because the built-in 'english' stop words are not stemmed
# while the tokenizer stems - confirm whether stemmed stop words are wanted.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize_and_stem,
    stop_words='english',
    ngram_range=(1, 3),
    max_df=0.8,
    min_df=0.01,
    use_idf=True,
)

# Fit on every description and keep the resulting document-term matrix.
tfidf_matrix = tfidf_vectorizer.fit_transform(description)

print(tfidf_matrix.shape)

  % sorted(inconsistent)


(15998, 904)


In [None]:
# Number of training descriptions.
print(len(train_x))

9998


In [None]:
# Dense view of the first len(train_x) rows of the TF-IDF matrix. The row count
# is taken from the data instead of being hard-coded.
print(tfidf_matrix[:len(train_x)].toarray())

[[0.07268798 0.         0.         ... 0.         0.         0.        ]
 [0.06325744 0.         0.         ... 0.         0.         0.        ]
 [0.08012092 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.06954037 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [None]:
print(tfidf_matrix.toarray())
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); prefer the new method and fall back on older installs.
# The result is shown as a plain list either way.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
feature_names

[[0.07268798 0.         0.         ... 0.         0.         0.        ]
 [0.06325744 0.         0.         ... 0.         0.         0.        ]
 [0.08012092 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]




["'s",
 "'s life",
 'abandon',
 'abl',
 'abus',
 'accept',
 'accid',
 'accompani',
 'achiev',
 'act',
 'action',
 'activ',
 'actor',
 'actress',
 'actual',
 'addict',
 'adopt',
 'adult',
 'adventur',
 'affair',
 'affect',
 'age',
 'agent',
 'ago',
 'agre',
 'aid',
 'air',
 'alcohol',
 'allow',
 'alon',
 'alreadi',
 'alway',
 'america',
 'american',
 'ancient',
 'angel',
 'ani',
 'anim',
 'anoth',
 'answer',
 'anyon',
 'anyth',
 'apart',
 'appear',
 'approach',
 'archiv',
 'area',
 'arm',
 'armi',
 'arrest',
 'arriv',
 'art',
 'artist',
 'ask',
 'assist',
 'attack',
 'attempt',
 'attend',
 'attent',
 'attract',
 'audienc',
 'author',
 'award',
 'away',
 'babi',
 'background',
 'bad',
 'band',
 'bank',
 'bar',
 'base',
 'battl',
 'beauti',
 'becam',
 'becaus',
 'becom',
 'befor',
 'began',
 'begin',
 'believ',
 'best',
 'best friend',
 'better',
 'big',
 'biggest',
 'birth',
 'black',
 'bodi',
 'bond',
 'book',
 'born',
 'boss',
 'boy',
 'boyfriend',
 'break',
 'bring',
 'british',
 'bro

In [None]:
import xgboost
from xgboost import XGBClassifier

In [None]:
# Encode the genre labels as integer class ids; the fitted encoder is kept so the
# ids can be mapped back to label strings.
encoder = preprocessing.LabelEncoder()
t_y = encoder.fit_transform(train_y)

In [None]:
# Show the first ten encoded labels. The name `genre` is not defined anywhere in
# the notebook, so the encoded label array produced by the encoder cell is used.
print(t_y[0:10])

In [None]:
# First ten training labels from train_y.
train_y[0:10]

In [None]:
def test_model(classifier, feature_vector_train, label, feature_vector_test, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_test)
    return predictions

In [None]:
# Split the TF-IDF matrix back into its training and test parts. The split point
# comes from the data instead of a hard-coded row count.
# NOTE(review): this assumes the rows of the combined table are the training rows
# followed by the test rows, in the same order - confirm against the data files.
n_train = len(train_x)
assert tfidf_matrix.shape[0] == n_train + len(test_x), (
    "TF-IDF row count does not equal len(train_x) + len(test_x)"
)
tfidf_train_x = tfidf_matrix[:n_train, :]
tfidf_test_x = tfidf_matrix[n_train:, :]
print(tfidf_train_x.shape)
print(tfidf_test_x.shape)

(9998, 904)
(6000, 904)


In [None]:
# Check the container type of the test feature matrix.
type(tfidf_test_x)

scipy.sparse.csr.csr_matrix

In [None]:
# Fit XGBoost on the integer-encoded labels (t_y) and map the predicted class ids
# back to genre strings with the fitted encoder. Recent xgboost releases reject
# string class labels, so the raw train_y strings are not passed to fit().
# `pred` remains an array of genre strings.
pred = encoder.inverse_transform(
    test_model(xgboost.XGBClassifier(), tfidf_train_x, t_y, tfidf_test_x)
)

In [None]:
# Print the predictions for the test set.
print(pred)

[' comedy ' ' documentary ' ' documentary ' ... ' documentary ' ' short '
 ' comedy ']


In [None]:
# Copy the predictions and replace integer class ids with their genre strings;
# any value that is not one of the four ids is kept unchanged.
id_to_genre = {
    0: " comedy ",
    1: " documentary ",
    2: " drama ",
    3: " short ",
}
pred1 = [id_to_genre.get(value, value) for value in list(pred.copy())]

In [None]:
# Display the mapped prediction list.
pred1

In [None]:
import csv

# Write the submission file: a header row followed by one (id, genre) row per
# test item.
if len(test_id) != len(pred):
    raise ValueError("test_id and pred must have the same length")

# The with-block closes the file even if a write fails. newline="" is what the
# csv module asks for on files handed to csv.writer (avoids extra blank lines on
# platforms that translate line endings).
with open("tfidf_xgb.csv", "w", newline="") as csvFile:
    writer = csv.writer(csvFile)
    # write the column names first
    writer.writerow(["id", "genre"])
    writer.writerows(zip(test_id, pred))

In [None]:
# Export the whole TF-IDF matrix, converted to a dense array, as comma-separated text.
np.savetxt('tfidf.csv', tfidf_matrix.toarray(), delimiter = ',')