In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import tensorflow as tf

import os

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM
from keras.optimizers import Adam
from keras.layers import Input
from keras.models import Model
from keras.utils import plot_model
from keras.callbacks import ModelCheckpoint
import keras.backend as K
import keras

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = stopwords.words('german')
import re
pd.set_option('display.precision',20)

print("Tensorflow version " + tf.__version__)


Tensorflow version 2.19.0


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\const\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\const\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
df_train = pd.read_csv('dataset/training.txt', names=['id', 'lat','long','tweet'], header=None)
df_train.head()

Unnamed: 0,id,lat,long,tweet
0,119165,51.810067114093954,10.191330935251802,"Seit d Vase: ""Wenn ich kaputt gang, bringt das..."
1,100377,51.918187919463065,10.599244604316544,Haha bin au w isch der amig au so richtig lang...
2,109550,52.711073825503355,9.987374100719425,isch d hiltl dachterrasse amne samstig viel bs...
3,111440,52.38671140939596,11.700611510791369,Ich fühle mich wie die Weimarer Republik... .....
4,116670,52.31463087248321,9.701834532374104,Eui liebschte Lunchidee zum Mitneh? 😬 En Grill...


In [4]:
df_test = pd.read_csv('dataset/test.txt', names=['id','tweet'], header=None)
df_test.head()

Unnamed: 0,id,tweet
0,300121,👩min vibi funktionkert nöd... 👧hesch d'batteri...
1,302441,Ich: Also langsam söti scho schlafe Au Ich: He...
2,300266,Hez hie ou lüt wo dr ganz tag ine biudschirm m...
3,300911,je neui wohnig 😎 neua job 😎 eigeni kuchi 😎 abe...
4,302681,Schön wies grad chunt cho hagle u du nid d müg...


In [5]:
df_valid = pd.read_csv('dataset/validation.txt', names=['id', 'lat','long','tweet'], header=None)
df_valid.head()

Unnamed: 0,id,lat,long,tweet
0,203001,52.09838926174496,10.3544964028777,wenn mer anere Party bi Kollege en neue Bro fi...
1,200313,51.629865771812064,7.723453237410074,Heii guetä wuchestart gha? Jo wunderbar und du...
2,201966,52.242550335570456,8.967589928057553,Wieso ned? De werds eim emel ned langwiilig. O...
3,201123,52.242550335570456,8.967589928057553,Hani welle vo zueri uf lausanne denn hetts e d...
4,200374,51.70194630872484,8.416906474820145,Mir isch die Applikation plagiert worde as Dis...


In [6]:
def clean_text(input_text):
    text = re.sub(r'\W',' ', input_text) #Remove all non words
    text = re.sub(r'\d+',' ', text) #Remove all digits
    text = input_text.lower() #Converting text into lowercase
    text = re.sub(r'\s+[a-z]\s+',' ', text) #Remove all single letters
    text = re.sub(r'^\s+','', text) #Remove space from start of text
    text = re.sub(r'\s+$','', text) #Remove space from end of text
    text = re.sub(r'\s+',' ', text) #Remove all multi space
    text = text.split(' ') #Split the words into tokens
    text = [word for word in text if word not in stop_words] #Remove stopwords
    text = [WordNetLemmatizer().lemmatize(word) for word in text] #Lemmatize the words(get root form)
    text = ' '.join(text)

    return text

In [7]:
df_train['tweet'] = df_train.tweet.apply(lambda x: clean_text(x))

In [8]:
df_test['tweet'] = df_test.tweet.apply(lambda x: clean_text(x))

In [9]:
df_valid['tweet'] = df_valid.tweet.apply(lambda x: clean_text(x))

In [10]:
X = df_train['tweet']
y_lat = df_train['lat']/100
y_long = df_train['long']/100

X_valid = df_valid['tweet']
y_lat_valid = df_valid['lat']/100
y_long_valid = df_valid['long']/100

test = df_test['tweet']

In [11]:
string=''
for str in df_train['tweet']:
  string+=str+' '
spl = string.split()
print(len(spl))#toate cuvintele
myset = set(spl)
print(len(myset))#cuvinte distincte

979525
172542


In [12]:
num_words = 2000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(df_train['tweet'].values)

In [13]:
X = tokenizer.texts_to_sequences(df_train['tweet'].values)
X = pad_sequences(X, maxlen=num_words)

X_valid = tokenizer.texts_to_sequences(df_valid['tweet'].values)
X_valid = pad_sequences(X_valid, maxlen=num_words)

test = tokenizer.texts_to_sequences(df_test['tweet'].values)
test = pad_sequences(test, maxlen=num_words)

In [14]:
embed_dim = 128
lstm_out = 192

In [15]:
from keras.layers import Dropout
def create_model():
  input_tensor = Input(shape=(num_words, ))

  #hidden=Embedding(num_words, embed_dim, input_length = X.shape[1])(input_tensor)
  #hidden=LSTM(lstm_out, recurrent_dropout=0.2, dropout=0.2)(hidden)

  # Create the first output
  hidden_1=Embedding(num_words, embed_dim, input_length = X.shape[1])(input_tensor)
  hidden_1=LSTM(lstm_out, recurrent_dropout=0.2, dropout=0.2)(hidden_1)
  hidden_1=Dense(500, activation='relu')(hidden_1)
  hidden_1=Dense(500, activation='relu')(hidden_1)
  output_tensor_1 = Dense(1, activation='relu', use_bias=False, kernel_initializer='normal')(hidden_1)

  # Create the second output
  #hidden_2=Embedding(num_words, embed_dim)(input_tensor)
  #hidden_2=LSTM(lstm_out, recurrent_dropout=0.2, dropout=0.2)(hidden_2)
  hidden_2=Dense(500, activation='relu')(output_tensor_1)
  hidden_2=Dense(500, activation='relu')(hidden_2)
  hidden_2=Dense(500, activation='relu')(hidden_2)
  output_tensor_2 = Dense(1)(hidden_2)

  # Create a model with 2 outputs
  model = Model(input_tensor, [output_tensor_1,output_tensor_2])

  optimizer = Adam()
  model.compile(loss=['mean_absolute_error','mean_absolute_error'],optimizer=optimizer)
  return model

In [16]:
model = create_model()



In [17]:
filepath='cp_keras.h5'
dirpath=os.path.dirname(filepath)
checkpoint=ModelCheckpoint(filepath,monitor='val_loss',verbose=1,save_best_only=True)

In [18]:
# train the keras model on the dataset
model.fit(X, [y_lat,y_long], epochs=1, validation_data=(X_valid, [y_long_valid,y_lat_valid]),callbacks=[checkpoint],shuffle=True)

[1m706/706[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - dense_2_loss: 0.0327 - dense_6_loss: 0.0158 - loss: 0.0485
Epoch 1: val_loss improved from None to 0.84318, saving model to cp_keras.h5




[1m706/706[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2867s[0m 4s/step - dense_2_loss: 0.0137 - dense_6_loss: 0.0122 - loss: 0.0259 - val_dense_2_loss: 0.4220 - val_dense_6_loss: 0.4213 - val_loss: 0.8432


<keras.src.callbacks.history.History at 0x17c330f3890>

In [19]:
model = tf.keras.models.load_model(filepath)



In [20]:
# summarize layers
print(model.summary())

None


In [24]:
from keras.utils import plot_model
plot_model(model, show_shapes=True, show_layer_names=True)

You must install pydot (`pip install pydot`) for `plot_model` to work.


In [25]:
predictions_valid = model.predict(X_valid)

[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 520ms/step


In [26]:
df_valid['predicted_lat']=predictions_valid[0].max(axis=1)

In [27]:
df_valid['predicted_long']=predictions_valid[1].max(axis=1)

In [28]:
print(model.evaluate(X_valid,
                     [y_lat_valid,y_long_valid],
                     verbose=False))

[0.016810571774840355, 0.006307543721050024, 0.010465609841048717]


In [29]:
df_valid

Unnamed: 0,id,lat,long,tweet,predicted_lat,predicted_long
0,203001,52.09838926174496265276,10.35449640287770023406,mer anere party bi kollege en neue bro findt h...,0.52146583795547485352,0.09638313204050064087
1,200313,51.62986577181206371279,7.72345323741007305784,heii guetä wuchestart gha? jo wunderbar du? ja...,0.51711761951446533203,0.09637607634067535400
2,201966,52.24255033557045635462,8.96758992805755283939,wieso ned? de werds eim emel ned langwiilig. o...,0.51607179641723632812,0.09637437760829925537
3,201123,52.24255033557045635462,8.96758992805755283939,hani welle vo zueri uf lausanne hetts durchsag...,0.51934522390365600586,0.09637968987226486206
4,200374,51.70194630872484253814,8.41690647482014497882,isch applikation plagiert worde a diskussionsf...,0.52085357904434204102,0.09638213366270065308
...,...,...,...,...,...,...
3039,200754,52.62047945205478072239,10.29454545454545666416,git hüt schöns mache? 😄 (elei haha) uf em sofa...,0.51792967319488525391,0.09637739509344100952
3040,201772,52.84116438356164735524,7.80318181818181955123,grad sm vo salt beko: unglaubliches angebot: g...,0.51996272802352905273,0.09638069570064544678
3041,201431,51.88486301369860598243,8.18977272727272520569,lauft chestehouz uf em hoger obe?🤔 voll schien...,0.51891160011291503906,0.09637898951768875122
3042,202066,51.77452054794520108771,9.65022727272727287584,glaub scho chli en psycho liebs go jogge zgah ...,0.51548910140991210938,0.09637343883514404297


In [30]:
predictions_test = model.predict(test)

[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 477ms/step


In [31]:
df_test['lat']=predictions_test[0].max(axis=1)
df_test['long']=predictions_test[1].max(axis=1)

In [32]:
df_test=df_test.drop('tweet', axis=1)

In [33]:
print(df_test)

          id                     lat                    long
0     300121  0.51732438802719116211  0.09637641906738281250
1     302441  0.51114243268966674805  0.09636639058589935303
2     300266  0.50165534019470214844  0.09635099768638610840
3     300911  0.51744657754898071289  0.09637661278247833252
4     302681  0.51230037212371826172  0.09636826813220977783
...      ...                     ...                     ...
3133  300151  0.51738721132278442383  0.09637651592493057251
3134  300302  0.51961028575897216797  0.09638011455535888672
3135  301963  0.51894670724868774414  0.09637904167175292969
3136  302180  0.50940078496932983398  0.09636355936527252197
3137  301697  0.52185487747192382812  0.09638375788927078247

[3138 rows x 3 columns]


In [34]:
df_test.to_csv('/content/drive/MyDrive/knn folder/rezultatKerasModel1.txt', index=False, decimal='.', sep=',', float_format='%.20f')

OSError: Cannot save file into a non-existent directory: '\content\drive\MyDrive\knn folder'