In [None]:
import numpy as np
import csv
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Mount Google Drive at /content/drive; the data files and the saved model used
# below are read from and written to paths under 'drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


単語ベクトルの作成

In [None]:
def make_word2id():
    """Build the word -> id vocabulary from the training file and save it.

    Reads ``drive/My Drive/train.txt``. Each line is expected to start with a
    label token followed by whitespace-separated words (``load_data`` parses
    the same file as tab-separated with the label in the first column).
    Every distinct word receives the next free integer id, in order of first
    appearance. The mapping is also written to ``drive/My Drive/dic.txt`` as
    one ``word,id`` line per entry.

    Returns:
        dict: mapping from word (str) to word id (int).
    """
    word2id = {}  # dictionary: word2id = {'word': word id}

    with open('drive/My Drive/train.txt', 'r', encoding='utf_8') as f:
        for line in f:
            # Skip the whole leading label token, not just its first
            # character, so a label such as "12" does not leave "2" behind
            # as a bogus vocabulary word. Blank lines yield no tokens.
            for word in line.split()[1:]:
                if word not in word2id:
                    word2id[word] = len(word2id)

    with open('drive/My Drive/dic.txt', 'w', encoding='utf_8') as f2:
        for word, word_id in word2id.items():
            f2.write('{},{}\n'.format(word, word_id))
    return word2id


In [None]:
# NOTE(review): this repeats the Drive mount already performed in the first
# cell; the recorded output shows the drive was already mounted when this ran,
# so the cell looks redundant - confirm before removing.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def make_feature(word2id):
    """Convert the train/test texts into binary bag-of-words feature files.

    For each of ``train`` and ``test``, reads ``drive/My Drive/<name>.txt`` and
    writes ``drive/My Drive/<name>_feature.txt``. Every input line produces one
    output line of ``len(word2id)`` comma-separated ``0``/``1`` flags, where
    position ``word2id[w]`` is ``1`` when word ``w`` occurs on that line.
    Words that are not in ``word2id`` are ignored, and a blank input line
    produces an all-zero row so that row counts stay aligned with the input.

    Args:
        word2id: mapping from word to column index; the indices are expected
            to lie in ``range(len(word2id))``.

    Returns:
        None. The result is the pair of feature files written to disk.
    """
    vocab_size = len(word2id)

    for text_name in ['train', 'test']:
        src_path = 'drive/My Drive/' + text_name + '.txt'
        dst_path = 'drive/My Drive/' + text_name + '_feature.txt'

        # Rows are written as they are produced, so the whole feature matrix
        # never has to be held in memory at once.
        with open(src_path, 'r', encoding='utf_8') as f, \
                open(dst_path, 'w', encoding='utf_8') as f2:
            for line in f:
                row = [0] * vocab_size
                # Skip the whole leading label token, not just its first
                # character, so the tail of a multi-character label is never
                # looked up as if it were a word.
                for word in line.split()[1:]:
                    if word in word2id:
                        row[word2id[word]] = 1

                f2.write(','.join(map(str, row)))
                f2.write('\n')

In [None]:
# Build the dictionary (mapping of words to word ids)
word2id = make_word2id()

# Convert the text data into feature vectors
make_feature(word2id)


モデルの学習

In [None]:
def load_data(X_text,y_text):
    """Load a feature matrix and its labels from two text files.

    Args:
        X_text: path to a comma-separated file with one sample per line;
            every field must parse as a float.
        y_text: path to a tab-separated file whose first column holds the
            label of each sample; the label must parse as a float. Any
            further columns are ignored.

    Returns:
        tuple: ``(X_data, y_data)`` where ``X_data`` is a list of rows (each a
        list of floats) and ``y_data`` is a list of floats.

    Raises:
        ValueError: if a field cannot be converted to float.
        IndexError: if the label file contains an empty line.
    """
    # Load the features. newline='' lets the csv module handle line endings
    # itself, as its documentation recommends.
    with open(X_text, encoding="utf_8", newline='') as f:
        reader = csv.reader(f, delimiter=',')
        X_data = [[float(n) for n in row] for row in reader]

    # Load the category ids. Quote processing is disabled because the other
    # columns are free text: with the default dialect a field starting with
    # '"' would swallow the following lines into a single field and silently
    # drop their labels.
    with open(y_text, encoding="utf_8", newline='') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        y_data = [float(row[0]) for row in reader]

    return X_data,y_data


# データの読み込み

In [None]:
# Load the training data (features from train_feature.txt, labels from train.txt)
X_train,y_train = load_data('drive/My Drive/train_feature.txt','drive/My Drive/train.txt')

# Load the evaluation data (features from test_feature.txt, labels from test.txt)
X_test,y_test = load_data('drive/My Drive/test_feature.txt','drive/My Drive/test.txt')

In [None]:
# Train a logistic regression model
# In scikit-learn, C is the inverse of the regularization strength, so a
# smaller value means stronger regularization.
lr = LogisticRegression(C=0.1) # regularization parameter set to 0.1
# Alternative with scikit-learn's default settings:
#lr = LogisticRegression()
lr.fit(X_train,y_train)
# Persist the fitted model to Drive so it can be reloaded for prediction.
joblib.dump(lr, 'drive/My Drive/model.joblib')

['drive/My Drive/model.joblib']

カテゴリ推定

In [None]:
# Load the trained model
# joblib.load unpickles the file, which can execute arbitrary code; only load
# model files that come from a trusted source.
lf = joblib.load('drive/My Drive/model.joblib')


In [None]:
# Show accuracy
# This first figure is measured on the training data the model was fitted on,
# so it is an optimistic estimate.
print(f'Accuracy: {accuracy_score(y_train, lf.predict(X_train))}')

# Categories predicted for the evaluation data are available via lf.predict(X_test)
res = lf.predict(X_test)
# Accuracy on the held-out evaluation data; y_test was loaded for this purpose
# but was previously never used.
print(f'Test accuracy: {accuracy_score(y_test, res)}')
print(res)
# NOTE(review): the probabilities below are for the training samples, while
# `res` above holds predictions for the evaluation samples - confirm whether
# X_test was intended here.
proba = lf.predict_proba(X_train)
print(proba)

# Inspect the feature weights
# Full dump of every weight for every class (kept disabled):
# category = ['Business', 'Entertainment', 'Science and Technology', 'Health']
# id2word = {v: k for k, v in word2id.items()}
# for class_idx in range(lf.coef_.shape[0]):
#     for word_id in id2word:
#         print(f'{category[class_idx]}:{id2word[word_id]}:{lf.coef_[class_idx][word_id]}')


#### Output section ####

category = ['Business', 'Entertainment', 'Science and Technology', 'Health']
id2word = {v: k for k, v in word2id.items()}
top_n = 10  # number of words to show at each end of the weight ranking

# NOTE(review): rows of coef_ are labelled by position in `category`. For a
# two-class problem scikit-learn stores a single row that describes the second
# class (classes_[1]), so the 'Business' label printed for row 0 may be
# misleading in that case - confirm against lf.classes_.
# The loop variable is deliberately not called `list`, so the builtin stays
# usable in later cells.
for class_idx in range(lf.coef_.shape[0]):
  weights = lf.coef_[class_idx]
  order = np.argsort(weights)  # feature ids sorted by ascending weight
  # Take exactly top_n from each end. The previous slices [-11:-1] and [0:9]
  # skipped the single largest weight and returned only nine smallest ones.
  largest_index = order[-top_n:]   # ascending, so the strongest word is last
  smallest_index = order[:top_n]   # ascending, so the most negative word is first

  print(category[class_idx] + " largest:")
  print([id2word[x] for x in largest_index])
  print([weights[x] for x in largest_index])

  print(category[class_idx] + " smallest:")
  print([id2word[x] for x in smallest_index])
  print([weights[x] for x in smallest_index])


Accuracy: 0.695
[1. 1. 0. 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1.
 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0.
 1. 0. 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0.
 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1.
 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1.
 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0.
 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 1. 1. 0.
 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0.
 0. 0. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1.
 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 1