<a href="https://colab.research.google.com/github/Merostoroloji/VC/blob/main/TextAnalys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import shutil
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the 20 Newsgroups corpus to four topics. The list is defined once so the
# train and test splits can never drift apart and end up with different label sets.
NEWSGROUP_CATEGORIES = ["comp.graphics", "comp.windows.x", "rec.autos", "sci.space"]

documents_train = fetch_20newsgroups(subset="train", categories=NEWSGROUP_CATEGORIES)
documents_test = fetch_20newsgroups(subset="test", categories=NEWSGROUP_CATEGORIES)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: the vocabulary is learned from the training documents only
# and the fitted vectorizer is then reused for the test documents.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(documents_train["data"])
X_test = vectorizer.transform(documents_test["data"])

# Class labels for each split.
y_train, y_test = documents_train["target"], documents_test["target"]

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Baseline: random forest trained on the raw token counts.
# A fixed random_state makes the forest's internal randomness repeatable, so the
# scores are the same on every run (including after a kernel restart).
mdl = RandomForestClassifier(random_state=42)
mdl.fit(X_train, y_train)

ypred = mdl.predict(X_test)

# Per-class precision / recall / F1 on the test split, labelled with the newsgroup names.
print(classification_report(y_test, ypred, target_names=documents_train["target_names"]))

                precision    recall  f1-score   support

 comp.graphics       0.72      0.84      0.77       389
comp.windows.x       0.89      0.74      0.81       395
     rec.autos       0.87      0.92      0.89       396
     sci.space       0.95      0.90      0.93       394

      accuracy                           0.85      1574
     macro avg       0.86      0.85      0.85      1574
  weighted avg       0.86      0.85      0.85      1574



In [5]:
from sklearn.feature_extraction.text import TfidfTransformer

# Term-frequency features: same transformer class as TF-IDF, with IDF weighting switched off.
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train)
X_test_tf = tf_transformer.transform(X_test)

# Refit the existing classifier on the rescaled features and predict the test split.
ypred = mdl.fit(X_train_tf, y_train).predict(X_test_tf)

print(
    classification_report(
        y_test,
        ypred,
        target_names=documents_train["target_names"],
    )
)

                precision    recall  f1-score   support

 comp.graphics       0.72      0.84      0.77       389
comp.windows.x       0.89      0.77      0.82       395
     rec.autos       0.88      0.92      0.90       396
     sci.space       0.94      0.87      0.91       394

      accuracy                           0.85      1574
     macro avg       0.86      0.85      0.85      1574
  weighted avg       0.86      0.85      0.85      1574



In [6]:
# Full TF-IDF features: IDF weighting enabled this time.
tfidf_transformer = TfidfTransformer(use_idf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train)
X_test_tfidf = tfidf_transformer.transform(X_test)

# Refit the existing classifier on the TF-IDF features and predict the test split.
ypred = mdl.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

print(
    classification_report(
        y_test,
        ypred,
        target_names=documents_train["target_names"],
    )
)

                precision    recall  f1-score   support

 comp.graphics       0.70      0.83      0.76       389
comp.windows.x       0.87      0.73      0.79       395
     rec.autos       0.89      0.92      0.90       396
     sci.space       0.95      0.89      0.92       394

      accuracy                           0.84      1574
     macro avg       0.85      0.84      0.84      1574
  weighted avg       0.85      0.84      0.84      1574



In [7]:
# Toy corpus of three short sentences, used to inspect what the layer learns.
data = [
    "Ali ata bak",
    "Ayşe topu tut",
    "Ali koş, Ayşe koş",
]

# Keras text vectorization in tf_idf output mode, vocabulary capped at 10,000 tokens.
text_vectorization = layers.TextVectorization(max_tokens=10000, output_mode="tf_idf")
text_vectorization.adapt(data)

# Last expression of the cell: display the learned vocabulary.
text_vectorization.get_vocabulary()


['[UNK]', 'koş', 'ayşe', 'ali', 'tut', 'topu', 'bak', 'ata']

In [8]:
# Show the layer's tf_idf output as a table: one row per sentence in `data`,
# one column per vocabulary token.
pd.DataFrame(
    data=text_vectorization(data).numpy(),
    columns=text_vectorization.get_vocabulary(),
)

Unnamed: 0,[UNK],koş,ayşe,ali,tut,topu,bak,ata
0,0.0,0.0,0.0,0.693147,0.0,0.0,0.916291,0.916291
1,0.0,0.0,0.693147,0.0,0.916291,0.916291,0.0,0.0
2,0.0,1.832582,0.693147,0.693147,0.0,0.0,0.0,0.0


In [12]:
# Vectorize the newsgroup posts in tf_idf mode with the vocabulary capped at 10,000 tokens.
text_vectorization = layers.TextVectorization(max_tokens=10000, output_mode="tf_idf")
text_vectorization.adapt(documents_train["data"])

X_train = text_vectorization(documents_train["data"])
X_test = text_vectorization(documents_test["data"])

# Fully connected classifier: two hidden ReLU layers and a 4-way softmax output
# (one unit per newsgroup category).
mdl = keras.Sequential(
    [
        layers.Dense(128, activation="relu", input_shape=(X_train.shape[1],)),
        layers.Dense(64, activation="relu"),
        layers.Dense(4, activation="softmax"),
    ]
)

mdl.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
mdl.fit(X_train, y_train, epochs=10)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7c5c9860eb30>

In [13]:
# Score the trained network on the held-out test split; the cell displays the loss
# followed by the accuracy metric requested in compile().
mdl.evaluate(x=X_test, y=y_test)



[0.3964623510837555, 0.9047014117240906]

In [28]:
# Integer token ids, with a fixed sequence length of 10 per document.
text_vectorization = layers.TextVectorization(
    max_tokens=10000, output_mode="int", output_sequence_length=10
)
text_vectorization.adapt(documents_train["data"])

# One-hot encode every token id; depth equals the max_tokens vocabulary cap.
X_train = tf.one_hot(text_vectorization(documents_train["data"]), depth=10000)
X_test = tf.one_hot(text_vectorization(documents_test["data"]), depth=10000)

# Last expression of the cell: display the resulting tensor shape.
X_train.shape

TensorShape([2364, 10, 10000])

In [29]:
# Recurrent classifier: an LSTM reads the sequence, then two dense ReLU layers
# feed a 4-way softmax output.
mdl = keras.Sequential(
    [
        layers.LSTM(128, input_shape=X_train.shape[1:]),
        layers.Dense(128, activation="relu"),
        layers.Dense(64, activation="relu"),
        layers.Dense(4, activation="softmax"),
    ]
)

mdl.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
mdl.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7c5c8e382a70>

In [30]:
# Re-vectorize as integer token ids, this time with a sequence length of 100 per document.
text_vectorization = layers.TextVectorization(
    max_tokens=10000, output_mode="int", output_sequence_length=100
)
text_vectorization.adapt(documents_train["data"])

# Apply the adapted layer to both splits (train first, then test).
X_train, X_test = (
    text_vectorization(split["data"]) for split in (documents_train, documents_test)
)


In [20]:
# Integer token ids with a sequence length of 10 per document.
text_vectorization = layers.TextVectorization(
    max_tokens=10000, output_mode="int", output_sequence_length=10
)
text_vectorization.adapt(documents_train["data"])

# Apply the adapted layer to both splits (train first, then test).
X_train, X_test = (
    text_vectorization(split["data"]) for split in (documents_train, documents_test)
)

# Last expression of the cell: display the resulting tensor shape.
X_train.shape

TensorShape([2364, 10])

In [21]:
# Expand each token id into a 10,000-wide one-hot vector.
# NOTE: X_train / X_test are overwritten with their own encoding, so re-running this
# cell would feed the one-hot output back into tf.one_hot.
X_train, X_test = (tf.one_hot(token_ids, depth=10000) for token_ids in (X_train, X_test))

In [22]:
# Display the one-hot tensor's shape; the next cell uses X_train.shape[1:] as the LSTM input shape.
X_train.shape

TensorShape([2364, 10, 10000])

In [23]:
# Recurrent classifier over the one-hot token sequences.
mdl = keras.Sequential()
# Only the first layer declares the input shape: (sequence_length, vocabulary_size).
mdl.add(layers.LSTM(128, input_shape=X_train.shape[1:]))
# The hidden Dense layer previously carried a copy-pasted
# input_shape=(X_train.shape[1],), which does not describe its real input (the
# 128-unit LSTM output). Later layers take their input from the previous layer,
# so no input shape is passed here.
mdl.add(layers.Dense(128, activation="relu"))
mdl.add(layers.Dense(64, activation="relu"))
# One softmax unit per newsgroup category.
mdl.add(layers.Dense(4, activation="softmax"))

mdl.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
mdl.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7c5c8cbc2a70>

In [24]:
# Score the trained LSTM on the held-out test split; the cell displays the loss
# followed by the accuracy metric requested in compile().
mdl.evaluate(x=X_test, y=y_test)



[1.136893391609192, 0.7757306098937988]

In [32]:
# The Embedding layer consumes integer token ids of shape (batch, sequence_length).
# In notebook order the preceding cells leave X_train / X_test one-hot encoded, so the
# integer sequences are rebuilt here from the raw documents. This keeps the cell
# correct under "Restart & Run All" no matter which earlier cell ran last.
text_vectorization = layers.TextVectorization(
    max_tokens=10000,
    output_mode="int",
    output_sequence_length=100,
)
text_vectorization.adapt(documents_train["data"])
X_train = text_vectorization(documents_train["data"])
X_test = text_vectorization(documents_test["data"])

mdl = keras.Sequential()
# input_dim matches the max_tokens vocabulary cap used by the vectorizer above.
mdl.add(layers.Embedding(input_dim=10000, output_dim=200))
mdl.add(layers.LSTM(128))
mdl.add(layers.Dense(128, activation="relu"))
mdl.add(layers.Dense(64, activation="relu"))
# One softmax unit per newsgroup category.
mdl.add(layers.Dense(4, activation="softmax"))

mdl.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
# Hold out 10% of the training data for validation during training.
mdl.fit(X_train, y_train, epochs=10, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7c5c9018aaa0>

In [None]:
#

# **GÖRÜNTÜ (Image Classification)**

In [None]:
def make_subset(subset_name, start_index, end_index, src_root=None, dst_root=None):
    """Copy a contiguous range of numbered images into a dataset split folder.

    For each category ("Cat", "Dog") the files ``{start_index}.jpg`` up to
    ``{end_index - 1}.jpg`` are copied from ``<src_root>/<category>/`` to
    ``<dst_root>/<subset_name>/<category>/``. The destination folder is created
    if it does not exist yet.

    Args:
        subset_name: Name of the split folder to fill, e.g. "train".
        start_index: First image number to copy (inclusive).
        end_index: Image number to stop at (exclusive).
        src_root: Folder holding the per-category source images. Defaults to the
            notebook-level ``base_dir``.
        dst_root: Folder under which the split is created. Defaults to the
            notebook-level ``new_dir``.

    Raises:
        FileNotFoundError: If one of the numbered source images does not exist.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    src_root = Path(base_dir if src_root is None else src_root)
    dst_root = Path(new_dir if dst_root is None else dst_root)

    for category in ("Cat", "Dog"):
        # Named `target_dir` (not `dir`) to avoid shadowing the builtin, and reused
        # for the copy destination instead of rebuilding the same path.
        target_dir = dst_root / subset_name / category
        os.makedirs(target_dir, exist_ok=True)
        for index in range(start_index, end_index):
            fname = f"{index}.jpg"
            shutil.copyfile(src_root / category / fname, target_dir / fname)

# Build the three splits from consecutive image-number ranges:
# train = 0-999, val = 1000-1499, test = 1500-2499.
for split_name, first_index, stop_index in (
    ("train", 0, 1000),
    ("val", 1000, 1500),
    ("test", 1500, 2500),
):
    make_subset(subset_name=split_name, start_index=first_index, end_index=stop_index)

In [None]:
def clean_data(subset_name):
    """Delete image files in a split that TensorFlow cannot read and decode.

    Every file under ``new_dir/<subset_name>/<category>`` for the "Cat" and "Dog"
    categories is read and decoded. Files for which that fails are removed from
    disk, and the number of removed files is printed.

    Args:
        subset_name: Name of the split folder to scan, e.g. "train".
    """
    deleted = 0
    for category in ["Cat", "Dog"]:
        category_dir = new_dir / subset_name / category
        for fname in os.listdir(category_dir):
            path = category_dir / fname
            try:
                img_bytes = tf.io.read_file(str(path))
                tf.io.decode_image(img_bytes)
            except Exception:
                # `Exception` rather than a bare `except`: a kernel interrupt
                # (KeyboardInterrupt) must not be mistaken for a corrupt image and
                # cause a healthy file to be deleted.
                os.remove(path)
                deleted += 1

    print(deleted, "number of files")

In [None]:
def clean_data(subset_name):
    """Delete image files in a split that TensorFlow cannot read and decode.

    Every file under ``new_dir/<subset_name>/<category>`` for the "Cat" and "Dog"
    categories is read and decoded. Files for which that fails are removed from
    disk, and the number of removed files is printed.

    Note: this cell re-declares the function already defined in the preceding
    cell; this later definition is the one in effect for the calls that follow.

    Args:
        subset_name: Name of the split folder to scan, e.g. "train".
    """
    deleted = 0
    for category in ["Cat", "Dog"]:
        category_dir = new_dir / subset_name / category
        for fname in os.listdir(category_dir):
            path = category_dir / fname
            try:
                img_bytes = tf.io.read_file(str(path))
                tf.io.decode_image(img_bytes)
            except Exception:
                # `Exception` rather than a bare `except`: a kernel interrupt
                # (KeyboardInterrupt) must not be mistaken for a corrupt image and
                # cause a healthy file to be deleted.
                os.remove(path)
                deleted += 1

    print(deleted, "number of files")

# Remove undecodable images from every split before the datasets are built.
for split_name in ("train", "val", "test"):
    clean_data(split_name)

In [None]:
from tensorflow.keras.utils import image_dataset_from_directory


# Build one dataset per split folder, each yielding batches of 32 images resized to 180x180.
train_data, val_data, test_data = (
    image_dataset_from_directory(split_dir, image_size=(180, 180), batch_size=32)
    for split_dir in (
        "/content/Images/train",
        "/content/Images/val",
        "/content/Images/test",
    )
)

In [None]:
# Small CNN trained from scratch for the binary Cat/Dog task.
mdl = keras.Sequential(
    [
        # Scale pixel values by 1/255 inside the model.
        layers.Rescaling(1 / 255.0, input_shape=(180, 180, 3)),
        layers.Conv2D(128, kernel_size=(3, 3), activation="relu"),
        layers.MaxPool2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPool2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dense(128, activation="relu"),
        # Single sigmoid unit, paired with binary cross-entropy below.
        layers.Dense(1, activation="sigmoid"),
    ]
)

mdl.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
mdl.fit(train_data, epochs=10, validation_data=val_data)

In [None]:
# Random augmentation pipeline: horizontal flips, then rotations and zooms.
# NOTE(review): no cell visible in this notebook applies it to a model yet.
data_augmentation = keras.Sequential()
data_augmentation.add(layers.RandomFlip("horizontal"))
data_augmentation.add(layers.RandomRotation(0.1))
data_augmentation.add(layers.RandomZoom(0.2))

In [None]:
# VGG16 with ImageNet weights and without its top (classifier) layers.
vgg = keras.applications.vgg16.VGG16(include_top=False, weights="imagenet")

# Mark the whole base as non-trainable before it is placed inside another model.
vgg.trainable = False

In [None]:
# Transfer-learning classifier: frozen VGG16 base plus a small trainable dense head.
mdl = keras.Sequential()
mdl.add(layers.Input(shape=(180, 180, 3)))
# VGG16's ImageNet weights expect inputs transformed by its own preprocess_input,
# not raw 0-255 RGB pixels, so apply it inside the model before the frozen base.
mdl.add(layers.Lambda(keras.applications.vgg16.preprocess_input))
mdl.add(vgg)
mdl.add(layers.Flatten())
mdl.add(layers.Dense(128, activation="relu"))
# Single sigmoid unit for the binary Cat/Dog decision.
mdl.add(layers.Dense(1, activation="sigmoid"))

mdl.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
mdl.fit(train_data, epochs=10, validation_data=val_data)