# 1.1. コードリーディング/コードドキュメンテーション

In [None]:
import os
import numpy as np
from PIL import Image
import sqlite3
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# File extensions accepted as images (matched exactly, case-sensitive).
INCLUDED_EXTENTION = [".png", ".jpg"]

# SQLite database file used by the rest of this script.
dbname = 'images.db'
conn = sqlite3.connect(dbname)
try:
    cur = conn.cursor()
    # Reset the table. IF EXISTS keeps the very first run (fresh database,
    # no table yet) from failing with "no such table".
    cur.execute('DROP TABLE IF EXISTS image_info')
    # TEXT rather than STRING: SQLite gives a column declared STRING NUMERIC
    # affinity, which can coerce number-like values on insert.
    cur.execute('CREATE TABLE image_info (id INTEGER PRIMARY KEY AUTOINCREMENT, filename TEXT)')
    conn.commit()
finally:
    # Release the connection even when one of the statements fails.
    conn.close()

# Store the name of every image file in the pictures folder.
conn = sqlite3.connect(dbname)
cur = conn.cursor()
filenames = sorted(os.listdir('handwriting_pics'))
# One single-column row per file whose extension is on the allow-list.
image_rows = [
    (name,)
    for name in filenames
    if os.path.splitext(name)[1] in INCLUDED_EXTENTION
]
cur.executemany('INSERT INTO image_info(filename) values(?)', image_rows)
conn.commit()
cur.close()
conn.close()

# Read every (id, filename) row back from the table.
conn = sqlite3.connect(dbname)
try:
    cur = conn.cursor()
    # ORDER BY id: without it SQLite does not guarantee row order, and the
    # rows must line up with the labels built from the sorted file names.
    cur.execute('SELECT * FROM image_info ORDER BY id')
    pics_info = cur.fetchall()
    cur.close()
finally:
    # Release the connection even when the query fails.
    conn.close()

# Turn each stored picture into one row of 64 brightness values (0-16).
img_rows = []
for pic_info in pics_info:
    # Each row is (id, filename); only the file name is needed here.
    filename = pic_info[1]
    base, ext = os.path.splitext(filename)
    if ext not in INCLUDED_EXTENTION:
        continue
    img = Image.open(f'handwriting_pics/{filename}').convert('L')
    # Shrink to 8x8 pixels and invert the brightness (255 - pixel).
    img_data256 = 255 - np.array(img.resize((8, 8)))

    min_bright = img_data256.min()
    max_bright = img_data256.max()
    if max_bright == min_bright:
        # Uniform image: min-max scaling would divide by zero and yield NaN,
        # so map every pixel to 0 instead.
        img_data16 = np.zeros(img_data256.shape)
    else:
        img_data16 = (img_data256 - min_bright) / (max_bright - min_bright) * 16
    img_rows.append(img_data16.astype(np.uint8).reshape(-1))

# Build the matrix once instead of re-copying every earlier row with np.r_
# on each iteration; the result stays a float array of shape (n, 64).
img_test = np.array(img_rows, dtype=float).reshape(-1, 64)

# Train a logistic-regression classifier on scikit-learn's digits dataset.
digits = load_digits()
X, y = digits.data, digits.target  # X: features, y: target labels
# Keep half of the samples for training; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)
logreg = LogisticRegression(max_iter=2000)
logreg_model = logreg.fit(X_train, y_train)

# The expected label of each picture is the first character of its file name.
X_true = np.array([
    int(name[:1])
    for name in filenames
    if os.path.splitext(name)[1] in INCLUDED_EXTENTION
])
pred_logreg = logreg_model.predict(img_test)

# Report the expected labels, the predictions and the accuracy.
print('手書き文字の判別結果')
print('観測結果:', X_true)
print('予測結果:', pred_logreg)
print('正解率:', logreg_model.score(img_test, X_true))

### データへアクセスするコード

In [None]:
# File extensions accepted as images (matched exactly, case-sensitive).
INCLUDED_EXTENTION = [".png", ".jpg"]

# Database file: images.db is created if it does not exist yet, otherwise
# the existing file is opened.
dbname = 'images.db'
# Create the connection object for the database.
conn = sqlite3.connect(dbname)
try:
    # Create the cursor object used to run SQL statements.
    cur = conn.cursor()
    # Reset the database. IF EXISTS keeps the very first run (fresh
    # database, no table yet) from failing with "no such table".
    cur.execute('DROP TABLE IF EXISTS image_info')
    # Create the image_info table. TEXT rather than STRING: SQLite gives a
    # column declared STRING NUMERIC affinity, which can coerce number-like
    # values on insert.
    cur.execute('CREATE TABLE image_info (id INTEGER PRIMARY KEY AUTOINCREMENT, filename TEXT)')
    # Commit so the changes are saved.
    conn.commit()
finally:
    # Release the connection even when one of the statements fails.
    conn.close()

# Insert the image file names into the database.
conn = sqlite3.connect(dbname)
cur = conn.cursor()
filenames = sorted(os.listdir('handwriting_pics'))
# Keep only files whose extension is on the allow-list, one row per file.
image_rows = [
    (name,)
    for name in filenames
    if os.path.splitext(name)[1] in INCLUDED_EXTENTION
]
cur.executemany('INSERT INTO image_info(filename) values(?)', image_rows)
conn.commit()
cur.close()
conn.close()

# Fetch the contents of the table.
conn = sqlite3.connect(dbname)
try:
    cur = conn.cursor()
    # ORDER BY id makes the row order explicit (insertion order); without
    # it SQLite does not guarantee any particular order.
    cur.execute('SELECT * FROM image_info ORDER BY id')
    # fetchall() returns every remaining row as a list of tuples.
    pics_info = cur.fetchall()
    cur.close()
finally:
    # Release the connection even when the query fails.
    conn.close()

### データを学習/予測/計算するコード

In [None]:
# Learn from the training data.
# Load scikit-learn's digits dataset and split it into the explanatory
# variables X (features) and the target variable y (labels).
digits = load_digits()
X, y = digits.data, digits.target
# Split into training data and test data (half each, fixed random seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)
# Create the logistic-regression model and fit it on the training data.
logreg = LogisticRegression(max_iter=2000)
logreg_model = logreg.fit(X_train, y_train)

# Classify the image data.
# Build the array of correct answers: the first character of each image
# file name is its label.
X_true = np.array([
    int(name[:1])
    for name in filenames
    if os.path.splitext(name)[1] in INCLUDED_EXTENTION
])

# Feed the image data to the trained logistic-regression model.
pred_logreg = logreg_model.predict(img_test)

# Report the expected labels, the predictions and the accuracy.
print('手書き文字の判別結果')
print('観測結果:', X_true)
print('予測結果:', pred_logreg)
print('正解率:', logreg_model.score(img_test, X_true))

# 1.2. モジュール分割 / 関数分割

### データへアクセスするコード

In [None]:
import os
import numpy as np
from PIL import Image
import sqlite3
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# File extensions accepted as images; compared exactly (case-sensitive)
# against the extension returned by os.path.splitext.
INCLUDED_EXTENTION = [".png", ".jpg"]
# SQLite database file that stores the image file names.
dbname = 'images.db'
# Folder that holds the handwritten-character pictures.
dir_name = 'handwriting_pics'

def load_filenames(dir_name, included_ext=INCLUDED_EXTENTION):
    """Return the sorted image file names found directly in ``dir_name``.

    Args:
        dir_name: Folder to list.
        included_ext: Extensions (with leading dot) to keep; a name is kept
            only when its extension matches one of them exactly.

    Returns:
        A list of file names (not full paths) in sorted order.
    """
    return [
        entry
        for entry in sorted(os.listdir(dir_name))
        if os.path.splitext(entry)[1] in included_ext
    ]

def create_table(dbname):
    """Create an empty ``image_info`` table in the SQLite database ``dbname``.

    Any existing ``image_info`` table is dropped first, so each call leaves
    an empty table with an auto-incrementing ``id`` and a ``filename`` column.

    Args:
        dbname: Path of the SQLite database file.
    """
    conn = sqlite3.connect(dbname)
    try:
        cur = conn.cursor()
        # IF EXISTS keeps the first call on a fresh database (no table yet)
        # from failing with "no such table".
        cur.execute('DROP TABLE IF EXISTS image_info')
        # TEXT rather than STRING: SQLite gives a column declared STRING
        # NUMERIC affinity, which can coerce number-like values on insert.
        cur.execute('CREATE TABLE image_info (id INTEGER PRIMARY KEY AUTOINCREMENT, filename TEXT)')
        conn.commit()
    finally:
        # Release the connection even when one of the statements fails.
        conn.close()
    print("table is successully created")

def insert_filenames(dbname, dir_name):
    """Save the image file names found in ``dir_name`` to the database.

    The names come from ``load_filenames`` and are inserted, one row each,
    into the ``image_info`` table, which must already exist.

    Args:
        dbname: Path of the SQLite database file.
        dir_name: Folder containing the image files.
    """
    filenames = load_filenames(dir_name)
    conn = sqlite3.connect(dbname)
    try:
        cur = conn.cursor()
        # executemany expects one parameter tuple per row.
        cur.executemany(
            'INSERT INTO image_info(filename) values(?)',
            [(filename,) for filename in filenames],
        )
        conn.commit()
        cur.close()
    finally:
        # Release the connection even when an INSERT fails.
        conn.close()
    print("image file names are successully inserted")

def extract_filenames(dbname):
    """Fetch every row of the ``image_info`` table from the database.

    Args:
        dbname: Path of the SQLite database file.

    Returns:
        A list of row tuples (all columns of ``image_info``), ordered by
        ``id``; an empty list when the table has no rows.
    """
    conn = sqlite3.connect(dbname)
    try:
        cur = conn.cursor()
        # ORDER BY id makes the row order explicit (insertion order);
        # without it SQLite does not guarantee any particular order.
        cur.execute('SELECT * FROM image_info ORDER BY id')
        filenames = cur.fetchall()
        cur.close()
    finally:
        # Release the connection even when the query fails.
        conn.close()
    return filenames

# Rebuild the image_info table, store the image file names in it, then read
# the rows back. The result of the last call is not assigned to a name.
create_table(dbname)
insert_filenames(dbname, dir_name)
extract_filenames(dbname)

### データの前処理をするコード

In [None]:
# TODO: see p.425; the output still needs to be checked.
def load_filenames(dir_name, included_ext=INCLUDED_EXTENTION):
    """Build the list of image file names stored in ``dir_name``.

    Args:
        dir_name: Folder to list.
        included_ext: Extensions (with leading dot) to keep; matching is
            exact.

    Returns:
        The matching file names (not full paths), sorted.
    """
    return [
        entry
        for entry in sorted(os.listdir(dir_name))
        if os.path.splitext(entry)[1] in included_ext
    ]

def get_grayscale(dir_name):
    """Yield each image in ``dir_name`` converted to grayscale.

    File names come from ``load_filenames``, so images are produced in
    sorted file-name order. Each one is opened and converted to mode
    ``'L'``; nothing is read until the generator is iterated.

    Args:
        dir_name: Folder containing the image files.

    Yields:
        One grayscale image object per file.
    """
    for picture_name in load_filenames(dir_name):
        picture_path = f'{dir_name}/{picture_name}'
        grayscale = Image.open(picture_path).convert('L')
        yield grayscale

def get_shrinked_img(dir_name):
    """Convert every image in ``dir_name`` to a row of 64 brightness values.

    Each grayscale image from ``get_grayscale`` is resized to 8x8 pixels,
    inverted (``255 - pixel``), min-max scaled to the range 0-16, truncated
    to whole numbers and flattened.

    Args:
        dir_name: Folder containing the image files.

    Returns:
        A float ``numpy.ndarray`` of shape ``(number_of_images, 64)``;
        shape ``(0, 64)`` when there are no images.
    """
    crop_size = 8
    rows = []
    for img in get_grayscale(dir_name):
        img_data256 = 255 - np.array(img.resize((crop_size, crop_size)))
        min_bright, max_bright = img_data256.min(), img_data256.max()
        if max_bright == min_bright:
            # Uniform image: min-max scaling would divide by zero and yield
            # NaN, so map every pixel to 0 instead.
            img_data16 = np.zeros(img_data256.shape)
        else:
            img_data16 = (img_data256 - min_bright) / (max_bright - min_bright) * 16
        rows.append(img_data16.astype(np.uint8).reshape(-1))
    # Build the matrix once instead of re-copying every earlier row with
    # np.r_ on each iteration; the result stays a float array.
    return np.array(rows, dtype=float).reshape(-1, crop_size * crop_size)

# Preprocess every image once and keep the result for the prediction step.
img_test = get_shrinked_img(dir_name)
# Echo the stored array instead of calling get_shrinked_img() a second time,
# which would re-read and re-process every image to produce the same data.
img_test

### データを学習/予測/計算するコード

In [None]:
import os
import numpy as np
from PIL import Image
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

def load_filenames(dir_name, included_ext=INCLUDED_EXTENTION):
    """List the image files in ``dir_name``, sorted by name.

    Args:
        dir_name: Folder to list.
        included_ext: Extensions (with leading dot) to keep; a file is kept
            only on an exact extension match.

    Returns:
        A sorted list of matching file names (not full paths).
    """
    return [
        entry
        for entry in sorted(os.listdir(dir_name))
        if os.path.splitext(entry)[1] in included_ext
    ]

def create_logreg_model():
    """Train and return a logistic-regression model for digit recognition.

    The model is fitted on half of scikit-learn's digits dataset; the split
    uses a fixed ``random_state`` so it is the same on every call.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    dataset = load_digits()
    # Only the training half is used; the held-out half is discarded.
    features_train, _, labels_train, _ = train_test_split(
        dataset.data,
        dataset.target,
        test_size=0.5,
        random_state=0,
    )
    return LogisticRegression(max_iter=2000).fit(features_train, labels_train)

def evaluate_probs(dir_name, img_test, logreg_model):
    """Print how well the trained model classifies the images in ``dir_name``.

    The expected label of each image is the first character of its file
    name, converted to ``int``.

    Args:
        dir_name: Folder containing the image files.
        img_test: Preprocessed image data, one row per image, in the same
            order as ``load_filenames(dir_name)``.
        logreg_model: Fitted model providing ``predict`` and ``score``.

    Returns:
        A fixed status message string.
    """
    labels = np.array([int(name[:1]) for name in load_filenames(dir_name)])
    predictions = logreg_model.predict(img_test)

    print('手書き文字の判別結果')
    print('観測結果:', labels)
    print('予測結果:', predictions)
    accuracy = logreg_model.score(img_test, labels)
    print('正解率:', accuracy)
    return "Propability calculation is successfully finished"

# Train the classifier, then evaluate it on the preprocessed images.
# dir_name and img_test are not defined in this cell; they must already
# exist when it runs.
logreg_model = create_logreg_model()
evaluate_probs(dir_name, img_test, logreg_model)

# ロジスティック回帰の学習済みモデルを生成

In [None]:
import pickle
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Fit a logistic-regression model on half of scikit-learn's digits dataset
# and save the fitted model to disk.
digits = load_digits()
X, y = digits.data, digits.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

logreg = LogisticRegression(max_iter=2000)
model = logreg.fit(X_train, y_train)
# pickle writes bytes, so the file is opened in binary mode.
with open('model.pickle', 'wb') as fp:
    pickle.dump(model, fp)